diff --git a/args.c b/args.c index 14b031040a4b70057e5fa9485c3d4e045c9842d0..51c0fb9c4ebf4818a07044f3b23b968dd6f7c6f4 100644 --- a/args.c +++ b/args.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include <stdlib.h> #include <string.h> #include <limits.h> @@ -22,42 +21,36 @@ extern void die(const char *fmt, ...) __attribute__((noreturn)); extern void die(const char *fmt, ...); #endif - struct arg arg_init(char **argv) { struct arg a; - a.argv = argv; + a.argv = argv; a.argv_step = 1; - a.name = NULL; - a.val = NULL; - a.def = NULL; + a.name = NULL; + a.val = NULL; + a.def = NULL; return a; } int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { struct arg arg; - if (!argv[0] || argv[0][0] != '-') - return 0; + if (!argv[0] || argv[0][0] != '-') return 0; arg = arg_init(argv); - if (def->short_name - && strlen(arg.argv[0]) == strlen(def->short_name) + 1 - && !strcmp(arg.argv[0] + 1, def->short_name)) { - + if (def->short_name && strlen(arg.argv[0]) == strlen(def->short_name) + 1 && + !strcmp(arg.argv[0] + 1, def->short_name)) { arg.name = arg.argv[0] + 1; arg.val = def->has_val ? arg.argv[1] : NULL; arg.argv_step = def->has_val ? 2 : 1; } else if (def->long_name) { const size_t name_len = strlen(def->long_name); - if (strlen(arg.argv[0]) >= name_len + 2 - && arg.argv[0][1] == '-' - && !strncmp(arg.argv[0] + 2, def->long_name, name_len) - && (arg.argv[0][name_len + 2] == '=' - || arg.argv[0][name_len + 2] == '\0')) { - + if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' && + !strncmp(arg.argv[0] + 2, def->long_name, name_len) && + (arg.argv[0][name_len + 2] == '=' || + arg.argv[0][name_len + 2] == '\0')) { arg.name = arg.argv[0] + 2; arg.val = arg.name[name_len] == '=' ? arg.name + name_len + 1 : NULL; arg.argv_step = 1; @@ -70,8 +63,7 @@ int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { if (arg.name && arg.val && !def->has_val) die("Error: option %s requires no argument.\n", arg.name); - if (arg.name - && (arg.val || !def->has_val)) { + if (arg.name && (arg.val || !def->has_val)) { arg.def = def; *arg_ = arg; return 1; @@ -80,15 +72,12 @@ int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { return 0; } - const char *arg_next(struct arg *arg) { - if (arg->argv[0]) - arg->argv += arg->argv_step; + if (arg->argv[0]) arg->argv += arg->argv_step; return *arg->argv; } - char **argv_dup(int argc, const char **argv) { char **new_argv = malloc((argc + 1) * sizeof(*argv)); @@ -97,9 +86,8 @@ char **argv_dup(int argc, const char **argv) { return new_argv; } - void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { - char option_text[40] = {0}; + char option_text[40] = { 0 }; for (; *defs; defs++) { const struct arg_def *def = *defs; @@ -109,15 +97,12 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { if (def->short_name && def->long_name) { char *comma = def->has_val ? 
"," : ", "; - snprintf(option_text, 37, "-%s%s%s --%s%6s", - def->short_name, short_val, comma, - def->long_name, long_val); + snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val, + comma, def->long_name, long_val); } else if (def->short_name) - snprintf(option_text, 37, "-%s%s", - def->short_name, short_val); + snprintf(option_text, 37, "-%s%s", def->short_name, short_val); else if (def->long_name) - snprintf(option_text, 37, " --%s%s", - def->long_name, long_val); + snprintf(option_text, 37, " --%s%s", def->long_name, long_val); fprintf(fp, " %-37s\t%s\n", option_text, def->desc); @@ -127,59 +112,53 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { fprintf(fp, " %-37s\t ", ""); for (listptr = def->enums; listptr->name; listptr++) - fprintf(fp, "%s%s", listptr->name, - listptr[1].name ? ", " : "\n"); + fprintf(fp, "%s%s", listptr->name, listptr[1].name ? ", " : "\n"); } } } - unsigned int arg_parse_uint(const struct arg *arg) { - long int rawval; - char *endptr; + long int rawval; + char *endptr; rawval = strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { - if (rawval >= 0 && rawval <= UINT_MAX) - return rawval; + if (rawval >= 0 && rawval <= UINT_MAX) return rawval; - die("Option %s: Value %ld out of range for unsigned int\n", - arg->name, rawval); + die("Option %s: Value %ld out of range for unsigned int\n", arg->name, + rawval); } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); return 0; } - int arg_parse_int(const struct arg *arg) { - long int rawval; - char *endptr; + long int rawval; + char *endptr; rawval = strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { - if (rawval >= INT_MIN && rawval <= INT_MAX) - return rawval; + if (rawval >= INT_MIN && rawval <= INT_MAX) return rawval; - die("Option %s: Value %ld out of range for signed int\n", - arg->name, rawval); + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); return 0; } - struct vpx_rational { int num; /**< fraction numerator */ int den; /**< fraction denominator */ }; struct vpx_rational arg_parse_rational(const struct arg *arg) { - long int rawval; - char *endptr; - struct vpx_rational rat; + long int rawval; + char *endptr; + struct vpx_rational rat; /* parse numerator */ rawval = strtol(arg->val, &endptr, 10); @@ -187,9 +166,11 @@ struct vpx_rational arg_parse_rational(const struct arg *arg) { if (arg->val[0] != '\0' && endptr[0] == '/') { if (rawval >= INT_MIN && rawval <= INT_MAX) rat.num = rawval; - else die("Option %s: Value %ld out of range for signed int\n", - arg->name, rawval); - } else die("Option %s: Expected / at '%c'\n", arg->name, *endptr); + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Expected / at '%c'\n", arg->name, *endptr); /* parse denominator */ rawval = strtol(endptr + 1, &endptr, 10); @@ -197,40 +178,37 @@ struct vpx_rational arg_parse_rational(const struct arg *arg) { if (arg->val[0] != '\0' && endptr[0] == '\0') { if (rawval >= INT_MIN && rawval <= INT_MAX) rat.den = rawval; - else die("Option %s: Value %ld out of range for signed int\n", - arg->name, rawval); - } else die("Option %s: Invalid character '%c'\n", arg->name, *endptr); + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); return rat; } - int 
arg_parse_enum(const struct arg *arg) { const struct arg_enum_list *listptr; - long int rawval; - char *endptr; + long int rawval; + char *endptr; /* First see if the value can be parsed as a raw value */ rawval = strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { /* Got a raw value, make sure it's valid */ for (listptr = arg->def->enums; listptr->name; listptr++) - if (listptr->val == rawval) - return rawval; + if (listptr->val == rawval) return rawval; } /* Next see if it can be parsed as a string */ for (listptr = arg->def->enums; listptr->name; listptr++) - if (!strcmp(arg->val, listptr->name)) - return listptr->val; + if (!strcmp(arg->val, listptr->name)) return listptr->val; die("Option %s: Invalid value '%s'\n", arg->name, arg->val); return 0; } - int arg_parse_enum_or_int(const struct arg *arg) { - if (arg->def->enums) - return arg_parse_enum(arg); + if (arg->def->enums) return arg_parse_enum(arg); return arg_parse_int(arg); } diff --git a/args.h b/args.h index 1f37151a028681d9b0f215bb6975261e19e5fd1d..54abe04607d97903212315bd1a04981b5335728e 100644 --- a/args.h +++ b/args.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #ifndef ARGS_H_ #define ARGS_H_ #include <stdio.h> @@ -18,29 +17,33 @@ extern "C" { #endif struct arg { - char **argv; - const char *name; - const char *val; - unsigned int argv_step; - const struct arg_def *def; + char **argv; + const char *name; + const char *val; + unsigned int argv_step; + const struct arg_def *def; }; struct arg_enum_list { const char *name; - int val; + int val; }; -#define ARG_ENUM_LIST_END {0} +#define ARG_ENUM_LIST_END \ + { 0 } typedef struct arg_def { const char *short_name; const char *long_name; - int has_val; + int has_val; const char *desc; const struct arg_enum_list *enums; } arg_def_t; -#define ARG_DEF(s,l,v,d) {s,l,v,d, NULL} -#define ARG_DEF_ENUM(s,l,v,d,e) {s,l,v,d,e} -#define ARG_DEF_LIST_END {0} +#define ARG_DEF(s, l, v, d) \ + { s, l, v, d, NULL } +#define ARG_DEF_ENUM(s, l, v, d, e) \ + { s, l, v, d, e } +#define ARG_DEF_LIST_END \ + { 0 } struct arg arg_init(char **argv); int arg_match(struct arg *arg_, const struct arg_def *def, char **argv); diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c index 1ae7a4b57f50292a9d369f60892a14d05ae0f7f2..51959f37df764adbcd01c5180e1445a9e1dbb594 100644 --- a/examples/decode_to_md5.c +++ b/examples/decode_to_md5.c @@ -65,8 +65,7 @@ static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) { static void print_md5(FILE *stream, unsigned char digest[16]) { int i; - for (i = 0; i < 16; ++i) - fprintf(stream, "%02x", digest[i]); + for (i = 0; i < 16; ++i) fprintf(stream, "%02x", digest[i]); } static const char *exec_name; @@ -86,12 +85,10 @@ int main(int argc, char **argv) { exec_name = argv[0]; - if (argc != 3) - die("Invalid number of arguments."); + if (argc != 3) die("Invalid number of arguments."); reader = vpx_video_reader_open(argv[1]); - if (!reader) - die("Failed to open %s for reading.", argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); @@ -99,8 +96,7 @@ int main(int argc, char **argv) { info = vpx_video_reader_get_info(reader); decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); - if (!decoder) - die("Unknown input codec."); + if (!decoder) die("Unknown input codec."); printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); @@ -111,8 +107,8 @@ int main(int 
argc, char **argv) { vpx_codec_iter_t iter = NULL; vpx_image_t *img = NULL; size_t frame_size = 0; - const unsigned char *frame = vpx_video_reader_get_frame(reader, - &frame_size); + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) die_codec(&codec, "Failed to decode frame"); @@ -121,14 +117,13 @@ int main(int argc, char **argv) { get_image_md5(img, digest); print_md5(outfile, digest); - fprintf(outfile, " img-%dx%d-%04d.i420\n", - img->d_w, img->d_h, ++frame_cnt); + fprintf(outfile, " img-%dx%d-%04d.i420\n", img->d_w, img->d_h, + ++frame_cnt); } } printf("Processed %d frames.\n", frame_cnt); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_reader_close(reader); diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c index 2233e473d364c39cf5c104d828b5ae53787f027e..29b8be94131cd59fb8ab9729e9d9f5d6154ead8c 100644 --- a/examples/decode_with_drops.c +++ b/examples/decode_with_drops.c @@ -84,12 +84,10 @@ int main(int argc, char **argv) { exec_name = argv[0]; - if (argc != 4) - die("Invalid number of arguments."); + if (argc != 4) die("Invalid number of arguments."); reader = vpx_video_reader_open(argv[1]); - if (!reader) - die("Failed to open %s for reading.", argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); @@ -103,8 +101,7 @@ int main(int argc, char **argv) { info = vpx_video_reader_get_info(reader); decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); - if (!decoder) - die("Unknown input codec."); + if (!decoder) die("Unknown input codec."); printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); @@ -116,8 +113,8 @@ int main(int argc, char **argv) { vpx_image_t *img = NULL; size_t frame_size = 0; int skip; - const unsigned char *frame = vpx_video_reader_get_frame(reader, - &frame_size); + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) die_codec(&codec, "Failed to decode frame."); @@ -139,8 +136,7 @@ int main(int argc, char **argv) { } printf("Processed %d frames.\n", frame_cnt); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", info->frame_width, info->frame_height, argv[2]); diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c index da2487820d23dfa1bf334ca38ba852ae3dfd364b..5c380a887c46f70927ea3f984ead18e6daf4124f 100644 --- a/examples/lossless_encoder.c +++ b/examples/lossless_encoder.c @@ -21,32 +21,28 @@ static const char *exec_name; void usage_exit(void) { - fprintf(stderr, "lossless_encoder: Example demonstrating lossless " - "encoding feature. Supports raw input only.\n"); + fprintf(stderr, + "lossless_encoder: Example demonstrating lossless " + "encoding feature. 
Supports raw input only.\n"); fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name); exit(EXIT_FAILURE); } -static int encode_frame(vpx_codec_ctx_t *codec, - vpx_image_t *img, - int frame_index, - int flags, - VpxVideoWriter *writer) { +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, int flags, VpxVideoWriter *writer) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, - flags, VPX_DL_GOOD_QUALITY); - if (res != VPX_CODEC_OK) - die_codec(codec, "Failed to encode frame"); + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, flags, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; - if (!vpx_video_writer_write_frame(writer, - pkt->data.frame.buf, + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); @@ -66,19 +62,17 @@ int main(int argc, char **argv) { int frame_count = 0; vpx_image_t raw; vpx_codec_err_t res; - VpxVideoInfo info = {0}; + VpxVideoInfo info = { 0 }; VpxVideoWriter *writer = NULL; const VpxInterface *encoder = NULL; const int fps = 30; exec_name = argv[0]; - if (argc < 5) - die("Invalid number of arguments"); + if (argc < 5) die("Invalid number of arguments"); encoder = get_vpx_encoder_by_name("vp9"); - if (!encoder) - die("Unsupported codec."); + if (!encoder) die("Unsupported codec."); info.codec_fourcc = encoder->fourcc; info.frame_width = strtol(argv[1], NULL, 0); @@ -86,23 +80,20 @@ int main(int argc, char **argv) { info.time_base.numerator = 1; info.time_base.denominator = fps; - if (info.frame_width <= 0 || - info.frame_height <= 0 || - (info.frame_width % 2) != 0 || - (info.frame_height % 2) != 0) { + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, - info.frame_height, 1)) { + info.frame_height, 1)) { die("Failed to allocate image."); } printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&codec, "Failed to get default codec config."); + if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; @@ -110,8 +101,7 @@ int main(int argc, char **argv) { cfg.g_timebase.den = info.time_base.denominator; writer = vpx_video_writer_open(argv[4], kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing.", argv[4]); + if (!writer) die("Failed to open %s for writing.", argv[4]); if (!(infile = fopen(argv[3], "rb"))) die("Failed to open %s for reading.", argv[3]); @@ -128,15 +118,15 @@ int main(int argc, char **argv) { } // Flush encoder. 
- while (encode_frame(&codec, NULL, -1, 0, writer)) {} + while (encode_frame(&codec, NULL, -1, 0, writer)) { + } printf("\n"); fclose(infile); printf("Processed %d frames.\n", frame_count); vpx_img_free(&raw); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_writer_close(writer); diff --git a/examples/resize_util.c b/examples/resize_util.c index e6fdd5bb2af2bdd91e0974fc0dc5b90fd0b9e4b8..7e529b2e203123d7fb85de478b1c31416e79b7f8 100644 --- a/examples/resize_util.c +++ b/examples/resize_util.c @@ -34,10 +34,8 @@ void usage_exit(void) { static int parse_dim(char *v, int *width, int *height) { char *x = strchr(v, 'x'); - if (x == NULL) - x = strchr(v, 'X'); - if (x == NULL) - return 0; + if (x == NULL) x = strchr(v, 'X'); + if (x == NULL) return 0; *width = atoi(v); *height = atoi(&x[1]); if (*width <= 0 || *height <= 0) @@ -93,30 +91,25 @@ int main(int argc, char *argv[]) { else frames = INT_MAX; - printf("Input size: %dx%d\n", - width, height); - printf("Target size: %dx%d, Frames: ", - target_width, target_height); + printf("Input size: %dx%d\n", width, height); + printf("Target size: %dx%d, Frames: ", target_width, target_height); if (frames == INT_MAX) printf("All\n"); else printf("%d\n", frames); - inbuf = (uint8_t*)malloc(width * height * 3 / 2); - outbuf = (uint8_t*)malloc(target_width * target_height * 3 / 2); + inbuf = (uint8_t *)malloc(width * height * 3 / 2); + outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2); inbuf_u = inbuf + width * height; inbuf_v = inbuf_u + width * height / 4; outbuf_u = outbuf + target_width * target_height; outbuf_v = outbuf_u + target_width * target_height / 4; f = 0; while (f < frames) { - if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) - break; - vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, - height, width, - outbuf, target_width, outbuf_u, outbuf_v, - target_width / 2, - target_height, target_width); + if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break; + vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height, + width, outbuf, target_width, outbuf_u, outbuf_v, + target_width / 2, target_height, target_width); fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout); f++; } diff --git a/examples/set_maps.c b/examples/set_maps.c index 1dc3ac0c98f8e1b75e6eff2372b64cd1c399f9af..d128e7d9a0d7822ef25be53510e8856a476db177 100644 --- a/examples/set_maps.c +++ b/examples/set_maps.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - // VP8 Set Active and ROI Maps // =========================== // @@ -86,8 +85,7 @@ static void set_roi_map(const vpx_codec_enc_cfg_t *cfg, roi.static_threshold[3] = 0; roi.roi_map = (uint8_t *)malloc(roi.rows * roi.cols); - for (i = 0; i < roi.rows * roi.cols; ++i) - roi.roi_map[i] = i % 4; + for (i = 0; i < roi.rows * roi.cols; ++i) roi.roi_map[i] = i % 4; if (vpx_codec_control(codec, VP8E_SET_ROI_MAP, &roi)) die_codec(codec, "Failed to set ROI map"); @@ -98,14 +96,13 @@ static void set_roi_map(const vpx_codec_enc_cfg_t *cfg, static void set_active_map(const vpx_codec_enc_cfg_t *cfg, vpx_codec_ctx_t *codec) { unsigned int i; - vpx_active_map_t map = {0, 0, 0}; + vpx_active_map_t map = { 0, 0, 0 }; map.rows = (cfg->g_h + 15) / 16; map.cols = (cfg->g_w + 15) / 16; map.active_map = (uint8_t *)malloc(map.rows * map.cols); - for (i = 0; i < map.rows * map.cols; ++i) - map.active_map[i] = i % 2; + for (i = 0; i < map.rows * map.cols; ++i) map.active_map[i] = i % 2; if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map)) die_codec(codec, "Failed to set active map"); @@ -115,7 +112,7 @@ static void set_active_map(const vpx_codec_enc_cfg_t *cfg, static void unset_active_map(const vpx_codec_enc_cfg_t *cfg, vpx_codec_ctx_t *codec) { - vpx_active_map_t map = {0, 0, 0}; + vpx_active_map_t map = { 0, 0, 0 }; map.rows = (cfg->g_h + 15) / 16; map.cols = (cfg->g_w + 15) / 16; @@ -125,25 +122,21 @@ static void unset_active_map(const vpx_codec_enc_cfg_t *cfg, die_codec(codec, "Failed to set active map"); } -static int encode_frame(vpx_codec_ctx_t *codec, - vpx_image_t *img, - int frame_index, - VpxVideoWriter *writer) { +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, VpxVideoWriter *writer) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0, - VPX_DL_GOOD_QUALITY); - if (res != VPX_CODEC_OK) - die_codec(codec, "Failed to encode frame"); + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, 0, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; - if (!vpx_video_writer_write_frame(writer, - pkt->data.frame.buf, + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); @@ -167,12 +160,11 @@ int main(int argc, char **argv) { VpxVideoInfo info; VpxVideoWriter *writer = NULL; const VpxInterface *encoder = NULL; - const int fps = 2; // TODO(dkovalev) add command line argument + const int fps = 2; // TODO(dkovalev) add command line argument const double bits_per_pixel_per_frame = 0.067; exec_name = argv[0]; - if (argc != 6) - die("Invalid number of arguments"); + if (argc != 6) die("Invalid number of arguments"); memset(&info, 0, sizeof(info)); @@ -187,35 +179,31 @@ int main(int argc, char **argv) { info.time_base.numerator = 1; info.time_base.denominator = fps; - if (info.frame_width <= 0 || - info.frame_height <= 0 || - (info.frame_width % 2) != 0 || - (info.frame_height % 2) != 0) { + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!vpx_img_alloc(&raw, 
VPX_IMG_FMT_I420, info.frame_width, - info.frame_height, 1)) { + info.frame_height, 1)) { die("Failed to allocate image."); } printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&codec, "Failed to get default codec config."); + if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; cfg.g_timebase.num = info.time_base.numerator; cfg.g_timebase.den = info.time_base.denominator; - cfg.rc_target_bitrate = (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * - cfg.g_h * fps / 1000); + cfg.rc_target_bitrate = + (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * cfg.g_h * fps / 1000); cfg.g_lag_in_frames = 0; writer = vpx_video_writer_open(argv[5], kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing.", argv[5]); + if (!writer) die("Failed to open %s for writing.", argv[5]); if (!(infile = fopen(argv[4], "rb"))) die("Failed to open %s for reading.", argv[4]); @@ -239,15 +227,15 @@ int main(int argc, char **argv) { } // Flush encoder. - while (encode_frame(&codec, NULL, -1, writer)) {} + while (encode_frame(&codec, NULL, -1, writer)) { + } printf("\n"); fclose(infile); printf("Processed %d frames.\n", frame_count); vpx_img_free(&raw); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_writer_close(writer); diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c index 8ccc81035e3ba649ca0e6eb216ebbf0896c6a0bc..2bb1a05245bd3abe27fa2e46cc061ba822b0a7eb 100644 --- a/examples/simple_decoder.c +++ b/examples/simple_decoder.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - // Simple Decoder // ============== // @@ -103,12 +102,10 @@ int main(int argc, char **argv) { exec_name = argv[0]; - if (argc != 3) - die("Invalid number of arguments."); + if (argc != 3) die("Invalid number of arguments."); reader = vpx_video_reader_open(argv[1]); - if (!reader) - die("Failed to open %s for reading.", argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); @@ -116,8 +113,7 @@ int main(int argc, char **argv) { info = vpx_video_reader_get_info(reader); decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); - if (!decoder) - die("Unknown input codec."); + if (!decoder) die("Unknown input codec."); printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); @@ -128,8 +124,8 @@ int main(int argc, char **argv) { vpx_codec_iter_t iter = NULL; vpx_image_t *img = NULL; size_t frame_size = 0; - const unsigned char *frame = vpx_video_reader_get_frame(reader, - &frame_size); + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) die_codec(&codec, "Failed to decode frame."); @@ -140,8 +136,7 @@ int main(int argc, char **argv) { } printf("Processed %d frames.\n", frame_cnt); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec"); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", info->frame_width, info->frame_height, argv[2]); diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c index 64f0a01379a1a3ea4cbd8ea667d02a4d4bcbdffd..331a2a595dd980d3d51fab6daa9fbc57bcd387dc 100644 --- a/examples/simple_encoder.c +++ b/examples/simple_encoder.c @@ -109,32 +109,27 @@ static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> " - "<keyframe-interval> <error-resilient> <frames to encode>\n" - "See comments in simple_encoder.c for more information.\n", + "<keyframe-interval> <error-resilient> <frames to encode>\n" + "See comments in simple_encoder.c for more information.\n", exec_name); exit(EXIT_FAILURE); } -static int encode_frame(vpx_codec_ctx_t *codec, - vpx_image_t *img, - int frame_index, - int flags, - VpxVideoWriter *writer) { +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, int flags, VpxVideoWriter *writer) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, - flags, VPX_DL_GOOD_QUALITY); - if (res != VPX_CODEC_OK) - die_codec(codec, "Failed to encode frame"); + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, flags, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; - if (!vpx_video_writer_write_frame(writer, - pkt->data.frame.buf, + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); @@ -155,7 +150,7 @@ int main(int argc, char **argv) { int frame_count = 0; vpx_image_t raw; vpx_codec_err_t res; - VpxVideoInfo info = {0}; + VpxVideoInfo info = { 0 }; VpxVideoWriter 
*writer = NULL; const VpxInterface *encoder = NULL; const int fps = 30; @@ -172,8 +167,7 @@ int main(int argc, char **argv) { exec_name = argv[0]; - if (argc != 9) - die("Invalid number of arguments"); + if (argc != 9) die("Invalid number of arguments"); codec_arg = argv[1]; width_arg = argv[2]; @@ -184,8 +178,7 @@ int main(int argc, char **argv) { max_frames = strtol(argv[8], NULL, 0); encoder = get_vpx_encoder_by_name(codec_arg); - if (!encoder) - die("Unsupported codec."); + if (!encoder) die("Unsupported codec."); info.codec_fourcc = encoder->fourcc; info.frame_width = strtol(width_arg, NULL, 0); @@ -193,27 +186,23 @@ int main(int argc, char **argv) { info.time_base.numerator = 1; info.time_base.denominator = fps; - if (info.frame_width <= 0 || - info.frame_height <= 0 || - (info.frame_width % 2) != 0 || - (info.frame_height % 2) != 0) { + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, - info.frame_height, 1)) { + info.frame_height, 1)) { die("Failed to allocate image."); } keyframe_interval = strtol(keyframe_interval_arg, NULL, 0); - if (keyframe_interval < 0) - die("Invalid keyframe interval value."); + if (keyframe_interval < 0) die("Invalid keyframe interval value."); printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&codec, "Failed to get default codec config."); + if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; @@ -223,8 +212,7 @@ int main(int argc, char **argv) { cfg.g_error_resilient = strtol(argv[7], NULL, 0); writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing.", outfile_arg); + if (!writer) die("Failed to open %s for writing.", outfile_arg); if (!(infile = fopen(infile_arg, "rb"))) die("Failed to open %s for reading.", infile_arg); @@ -239,20 +227,19 @@ int main(int argc, char **argv) { flags |= VPX_EFLAG_FORCE_KF; encode_frame(&codec, &raw, frame_count++, flags, writer); frames_encoded++; - if (max_frames > 0 && frames_encoded >= max_frames) - break; + if (max_frames > 0 && frames_encoded >= max_frames) break; } // Flush encoder. 
- while (encode_frame(&codec, NULL, -1, 0, writer)) {}; + while (encode_frame(&codec, NULL, -1, 0, writer)) { + } printf("\n"); fclose(infile); printf("Processed %d frames.\n", frame_count); vpx_img_free(&raw); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_writer_close(writer); diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c index 15a6617cd4f282a09ea046e652ac868ceff5c1b3..4c130ec18c3aef24d51ec5c6442338a8fa2fc8eb 100644 --- a/examples/twopass_encoder.c +++ b/examples/twopass_encoder.c @@ -61,25 +61,21 @@ static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> " - "<frame limit>\n", + "<frame limit>\n", exec_name); exit(EXIT_FAILURE); } -static int get_frame_stats(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned int duration, - vpx_enc_frame_flags_t flags, - unsigned int deadline, +static int get_frame_stats(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned int duration, + vpx_enc_frame_flags_t flags, unsigned int deadline, vpx_fixed_buf_t *stats) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags, - deadline); - if (res != VPX_CODEC_OK) - die_codec(ctx, "Failed to get frame stats."); + const vpx_codec_err_t res = + vpx_codec_encode(ctx, img, pts, duration, flags, deadline); + if (res != VPX_CODEC_OK) die_codec(ctx, "Failed to get frame stats."); while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) { got_pkts = 1; @@ -96,20 +92,16 @@ static int get_frame_stats(vpx_codec_ctx_t *ctx, return got_pkts; } -static int encode_frame(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned int duration, - vpx_enc_frame_flags_t flags, - unsigned int deadline, +static int encode_frame(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned int duration, + vpx_enc_frame_flags_t flags, unsigned int deadline, VpxVideoWriter *writer) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags, - deadline); - if (res != VPX_CODEC_OK) - die_codec(ctx, "Failed to encode frame."); + const vpx_codec_err_t res = + vpx_codec_encode(ctx, img, pts, duration, flags, deadline); + if (res != VPX_CODEC_OK) die_codec(ctx, "Failed to encode frame."); while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) { got_pkts = 1; @@ -117,8 +109,8 @@ static int encode_frame(vpx_codec_ctx_t *ctx, const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, - pkt->data.frame.sz, - pkt->data.frame.pts)) + pkt->data.frame.sz, + pkt->data.frame.pts)) die_codec(ctx, "Failed to write compressed frame."); printf(keyframe ? 
"K" : "."); fflush(stdout); @@ -128,14 +120,12 @@ static int encode_frame(vpx_codec_ctx_t *ctx, return got_pkts; } -static vpx_fixed_buf_t pass0(vpx_image_t *raw, - FILE *infile, +static vpx_fixed_buf_t pass0(vpx_image_t *raw, FILE *infile, const VpxInterface *encoder, - const vpx_codec_enc_cfg_t *cfg, - int max_frames) { + const vpx_codec_enc_cfg_t *cfg, int max_frames) { vpx_codec_ctx_t codec; int frame_count = 0; - vpx_fixed_buf_t stats = {NULL, 0}; + vpx_fixed_buf_t stats = { NULL, 0 }; if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) die_codec(&codec, "Failed to initialize encoder"); @@ -145,40 +135,33 @@ static vpx_fixed_buf_t pass0(vpx_image_t *raw, ++frame_count; get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, &stats); - if (max_frames > 0 && frame_count >= max_frames) - break; + if (max_frames > 0 && frame_count >= max_frames) break; } // Flush encoder. - while (get_frame_stats(&codec, NULL, frame_count, 1, 0, - VPX_DL_GOOD_QUALITY, &stats)) {} + while (get_frame_stats(&codec, NULL, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, + &stats)) { + } printf("Pass 0 complete. Processed %d frames.\n", frame_count); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); return stats; } -static void pass1(vpx_image_t *raw, - FILE *infile, - const char *outfile_name, - const VpxInterface *encoder, - const vpx_codec_enc_cfg_t *cfg, +static void pass1(vpx_image_t *raw, FILE *infile, const char *outfile_name, + const VpxInterface *encoder, const vpx_codec_enc_cfg_t *cfg, int max_frames) { - VpxVideoInfo info = { - encoder->fourcc, - cfg->g_w, - cfg->g_h, - {cfg->g_timebase.num, cfg->g_timebase.den} - }; + VpxVideoInfo info = { encoder->fourcc, + cfg->g_w, + cfg->g_h, + { cfg->g_timebase.num, cfg->g_timebase.den } }; VpxVideoWriter *writer = NULL; vpx_codec_ctx_t codec; int frame_count = 0; writer = vpx_video_writer_open(outfile_name, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing", outfile_name); + if (!writer) die("Failed to open %s for writing", outfile_name); if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) die_codec(&codec, "Failed to initialize encoder"); @@ -188,17 +171,16 @@ static void pass1(vpx_image_t *raw, ++frame_count; encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer); - if (max_frames > 0 && frame_count >= max_frames) - break; + if (max_frames > 0 && frame_count >= max_frames) break; } // Flush encoder. 
- while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, writer)) {} + while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, writer)) { + } printf("\n"); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_writer_close(writer); @@ -215,8 +197,8 @@ int main(int argc, char **argv) { vpx_fixed_buf_t stats; const VpxInterface *encoder = NULL; - const int fps = 30; // TODO(dkovalev) add command line argument - const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument + const int fps = 30; // TODO(dkovalev) add command line argument + const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument const char *const codec_arg = argv[1]; const char *const width_arg = argv[2]; const char *const height_arg = argv[3]; @@ -225,19 +207,17 @@ int main(int argc, char **argv) { int max_frames = 0; exec_name = argv[0]; - if (argc != 7) - die("Invalid number of arguments."); + if (argc != 7) die("Invalid number of arguments."); max_frames = strtol(argv[6], NULL, 0); encoder = get_vpx_encoder_by_name(codec_arg); - if (!encoder) - die("Unsupported codec."); + if (!encoder) die("Unsupported codec."); w = strtol(width_arg, NULL, 0); h = strtol(height_arg, NULL, 0); - if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) + if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) die("Invalid frame size: %dx%d", w, h); if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1)) @@ -247,8 +227,7 @@ int main(int argc, char **argv) { // Configuration res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&codec, "Failed to get default codec config."); + if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = w; cfg.g_h = h; diff --git a/examples/vpxcx_set_ref.c b/examples/vpxcx_set_ref.c index 25164857c9ed7eb953e177493f9ee33d7c095a88..6771d422e1a5de8a7fbb48c381cff86ea52292d9 100644 --- a/examples/vpxcx_set_ref.c +++ b/examples/vpxcx_set_ref.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - // VP10 Set Reference Frame // ============================ // @@ -61,7 +60,8 @@ static const char *exec_name; void usage_exit() { - fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> " + fprintf(stderr, + "Usage: %s <codec> <width> <height> <infile> <outfile> " "<frame> <limit(optional)>\n", exec_name); exit(EXIT_FAILURE); @@ -70,8 +70,7 @@ void usage_exit() { static int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) { uint32_t l_w = img1->d_w; - uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; const uint32_t c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; uint32_t i; @@ -99,10 +98,10 @@ static int compare_img(const vpx_image_t *const img1, return match; } -#define mmin(a, b) ((a) < (b) ? (a) : (b)) +#define mmin(a, b) ((a) < (b) ? 
(a) : (b)) static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, - int yloc[4], int uloc[4], int vloc[4]) { + const vpx_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]) { const uint32_t bsize = 64; const uint32_t bsizey = bsize >> img1->y_chroma_shift; const uint32_t bsizex = bsize >> img1->x_chroma_shift; @@ -190,21 +189,18 @@ static void find_mismatch(const vpx_image_t *const img1, } } -static void testing_decode(vpx_codec_ctx_t *encoder, - vpx_codec_ctx_t *decoder, - vpx_codec_enc_cfg_t *cfg, - unsigned int frame_out, +static void testing_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, + vpx_codec_enc_cfg_t *cfg, unsigned int frame_out, int *mismatch_seen) { vpx_image_t enc_img, dec_img; struct vp9_ref_frame ref_enc, ref_dec; - if (*mismatch_seen) - return; + if (*mismatch_seen) return; ref_enc.idx = 0; ref_dec.idx = 0; if (vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc)) - die_codec(encoder, "Failed to get encoder reference frame"); + die_codec(encoder, "Failed to get encoder reference frame"); enc_img = ref_enc.img; if (vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec)) die_codec(decoder, "Failed to get decoder reference frame"); @@ -216,37 +212,31 @@ static void testing_decode(vpx_codec_ctx_t *encoder, *mismatch_seen = 1; find_mismatch(&enc_img, &dec_img, y, u, v); - printf("Encode/decode mismatch on frame %d at" - " Y[%d, %d] {%d/%d}," - " U[%d, %d] {%d/%d}," - " V[%d, %d] {%d/%d}", - frame_out, - y[0], y[1], y[2], y[3], - u[0], u[1], u[2], u[3], - v[0], v[1], v[2], v[3]); + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); } vpx_img_free(&enc_img); vpx_img_free(&dec_img); } -static int encode_frame(vpx_codec_ctx_t *ecodec, - vpx_codec_enc_cfg_t *cfg, - vpx_image_t *img, - unsigned int frame_in, - VpxVideoWriter *writer, - int test_decode, - vpx_codec_ctx_t *dcodec, - unsigned int *frame_out, +static int encode_frame(vpx_codec_ctx_t *ecodec, vpx_codec_enc_cfg_t *cfg, + vpx_image_t *img, unsigned int frame_in, + VpxVideoWriter *writer, int test_decode, + vpx_codec_ctx_t *dcodec, unsigned int *frame_out, int *mismatch_seen) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; int got_data; - const vpx_codec_err_t res = vpx_codec_encode(ecodec, img, frame_in, 1, - 0, VPX_DL_GOOD_QUALITY); - if (res != VPX_CODEC_OK) - die_codec(ecodec, "Failed to encode frame"); + const vpx_codec_err_t res = + vpx_codec_encode(ecodec, img, frame_in, 1, 0, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(ecodec, "Failed to encode frame"); got_data = 0; @@ -257,11 +247,10 @@ static int encode_frame(vpx_codec_ctx_t *ecodec, const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) { - *frame_out += 1; - } + *frame_out += 1; + } - if (!vpx_video_writer_write_frame(writer, - pkt->data.frame.buf, + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(ecodec, "Failed to write compressed frame"); @@ -290,12 +279,12 @@ static int encode_frame(vpx_codec_ctx_t *ecodec, int main(int argc, char **argv) { FILE *infile = NULL; // Encoder - vpx_codec_ctx_t ecodec = {0}; - vpx_codec_enc_cfg_t cfg = {0}; + vpx_codec_ctx_t ecodec = { 0 }; + vpx_codec_enc_cfg_t cfg = { 0 }; unsigned int frame_in = 0; vpx_image_t raw; 
vpx_codec_err_t res; - VpxVideoInfo info = {0}; + VpxVideoInfo info = { 0 }; VpxVideoWriter *writer = NULL; const VpxInterface *encoder = NULL; @@ -320,8 +309,7 @@ int main(int argc, char **argv) { unsigned int limit = 0; exec_name = argv[0]; - if (argc < 7) - die("Invalid number of arguments"); + if (argc < 7) die("Invalid number of arguments"); codec_arg = argv[1]; width_arg = argv[2]; @@ -330,15 +318,13 @@ int main(int argc, char **argv) { outfile_arg = argv[5]; encoder = get_vpx_encoder_by_name(codec_arg); - if (!encoder) - die("Unsupported codec."); + if (!encoder) die("Unsupported codec."); update_frame_num = atoi(argv[6]); // In VP10, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are // allocated while calling vpx_codec_encode(), thus, setting reference for // 1st frame isn't supported. - if (update_frame_num <= 1) - die("Couldn't parse frame number '%s'\n", argv[6]); + if (update_frame_num <= 1) die("Couldn't parse frame number '%s'\n", argv[6]); if (argc > 7) { limit = atoi(argv[7]); @@ -352,23 +338,20 @@ int main(int argc, char **argv) { info.time_base.numerator = 1; info.time_base.denominator = fps; - if (info.frame_width <= 0 || - info.frame_height <= 0 || - (info.frame_width % 2) != 0 || - (info.frame_height % 2) != 0) { + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, - info.frame_height, 1)) { + info.frame_height, 1)) { die("Failed to allocate image."); } printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&ecodec, "Failed to get default codec config."); + if (res) die_codec(&ecodec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; @@ -378,8 +361,7 @@ int main(int argc, char **argv) { cfg.g_lag_in_frames = 3; writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing.", outfile_arg); + if (!writer) die("Failed to open %s for writing.", outfile_arg); if (!(infile = fopen(infile_arg, "rb"))) die("Failed to open %s for reading.", infile_arg); @@ -392,15 +374,14 @@ int main(int argc, char **argv) { die_codec(&ecodec, "Failed to set enable auto alt ref"); if (test_decode) { - const VpxInterface *decoder = get_vpx_decoder_by_name(codec_arg); - if (vpx_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0)) - die_codec(&dcodec, "Failed to initialize decoder."); + const VpxInterface *decoder = get_vpx_decoder_by_name(codec_arg); + if (vpx_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0)) + die_codec(&dcodec, "Failed to initialize decoder."); } // Encode frames. while (vpx_img_read(&raw, infile)) { - if (limit && frame_in >= limit) - break; + if (limit && frame_in >= limit) break; if (update_frame_num > 1 && frame_out + 1 == update_frame_num) { vpx_ref_frame_t ref; ref.frame_type = VP8_LAST_FRAME; @@ -418,17 +399,17 @@ int main(int argc, char **argv) { } } - encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode, - &dcodec, &frame_out, &mismatch_seen); + encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode, &dcodec, + &frame_out, &mismatch_seen); frame_in++; - if (mismatch_seen) - break; + if (mismatch_seen) break; } // Flush encoder. 
if (!mismatch_seen) while (encode_frame(&ecodec, &cfg, NULL, frame_in, writer, test_decode, - &dcodec, &frame_out, &mismatch_seen)) {} + &dcodec, &frame_out, &mismatch_seen)) { + } printf("\n"); fclose(infile); diff --git a/ivfdec.c b/ivfdec.c index 7fc25a0e8105e6e2a96cc5026b0042ab54cff286..f64e594ab0e6952a83363518fe0175672196d452 100644 --- a/ivfdec.c +++ b/ivfdec.c @@ -46,7 +46,8 @@ int file_is_ivf(struct VpxInputContext *input_ctx) { is_ivf = 1; if (mem_get_le16(raw_hdr + 4) != 0) { - fprintf(stderr, "Error: Unrecognized IVF version! This file may not" + fprintf(stderr, + "Error: Unrecognized IVF version! This file may not" " decode properly."); } @@ -69,14 +70,13 @@ int file_is_ivf(struct VpxInputContext *input_ctx) { return is_ivf; } -int ivf_read_frame(FILE *infile, uint8_t **buffer, - size_t *bytes_read, size_t *buffer_size) { - char raw_header[IVF_FRAME_HDR_SZ] = {0}; +int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { + char raw_header[IVF_FRAME_HDR_SZ] = { 0 }; size_t frame_size = 0; if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) - warn("Failed to read frame size\n"); + if (!feof(infile)) warn("Failed to read frame size\n"); } else { frame_size = mem_get_le32(raw_header); diff --git a/ivfdec.h b/ivfdec.h index dd29cc6174b73d579ee3c4e15e372e29d73cc9a6..af725572b48dcfcee53c63c186799321ccdae5e2 100644 --- a/ivfdec.h +++ b/ivfdec.h @@ -18,11 +18,11 @@ extern "C" { int file_is_ivf(struct VpxInputContext *input); -int ivf_read_frame(FILE *infile, uint8_t **buffer, - size_t *bytes_read, size_t *buffer_size); +int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif #endif // IVFDEC_H_ diff --git a/ivfenc.c b/ivfenc.c index 4a97c42731c93523379fcd2cb80c2bc5524c3722..a50d31839da01f09f5bf1e2284a3373222e2cc52 100644 --- a/ivfenc.c +++ b/ivfenc.c @@ -13,10 +13,8 @@ #include "vpx/vpx_encoder.h" #include "vpx_ports/mem_ops.h" -void ivf_write_file_header(FILE *outfile, - const struct vpx_codec_enc_cfg *cfg, - unsigned int fourcc, - int frame_cnt) { +void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg, + unsigned int fourcc, int frame_cnt) { char header[32]; header[0] = 'D'; diff --git a/ivfenc.h b/ivfenc.h index 6623687e8444061081a8393c578b3c794f1b7426..ebdce47be8f659f1e36115c1880f33c8dd53c6d2 100644 --- a/ivfenc.h +++ b/ivfenc.h @@ -19,17 +19,15 @@ struct vpx_codec_cx_pkt; extern "C" { #endif -void ivf_write_file_header(FILE *outfile, - const struct vpx_codec_enc_cfg *cfg, - uint32_t fourcc, - int frame_cnt); +void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg, + uint32_t fourcc, int frame_cnt); void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size); void ivf_write_frame_size(FILE *outfile, size_t frame_size); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif #endif // IVFENC_H_ diff --git a/md5_utils.c b/md5_utils.c index a9b979a419787cfc5211111c6c0aefe79104a060..093798b833983e57d2fdcd7a1bc9a4839a9b496f 100644 --- a/md5_utils.c +++ b/md5_utils.c @@ -20,19 +20,17 @@ * Still in the public domain. 
*/ -#include <string.h> /* for memcpy() */ +#include <string.h> /* for memcpy() */ #include "md5_utils.h" -static void -byteSwap(UWORD32 *buf, unsigned words) { +static void byteSwap(UWORD32 *buf, unsigned words) { md5byte *p; /* Only swap bytes for big endian machines */ int i = 1; - if (*(char *)&i == 1) - return; + if (*(char *)&i == 1) return; p = (md5byte *)buf; @@ -47,8 +45,7 @@ byteSwap(UWORD32 *buf, unsigned words) { * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious * initialization constants. */ -void -MD5Init(struct MD5Context *ctx) { +void MD5Init(struct MD5Context *ctx) { ctx->buf[0] = 0x67452301; ctx->buf[1] = 0xefcdab89; ctx->buf[2] = 0x98badcfe; @@ -62,8 +59,7 @@ MD5Init(struct MD5Context *ctx) { * Update context to reflect the concatenation of another buffer full * of bytes. */ -void -MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { +void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { UWORD32 t; /* Update byte count */ @@ -71,9 +67,9 @@ MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { t = ctx->bytes[0]; if ((ctx->bytes[0] = t + len) < t) - ctx->bytes[1]++; /* Carry from low to high */ + ctx->bytes[1]++; /* Carry from low to high */ - t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ + t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ if (t > len) { memcpy((md5byte *)ctx->in + 64 - t, buf, len); @@ -104,8 +100,7 @@ MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { * Final wrapup - pad to 64-byte boundary with the bit pattern * 1 0* (64-bit count of bits processed, MSB-first) */ -void -MD5Final(md5byte digest[16], struct MD5Context *ctx) { +void MD5Final(md5byte digest[16], struct MD5Context *ctx) { int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */ md5byte *p = (md5byte *)ctx->in + count; @@ -115,7 +110,7 @@ MD5Final(md5byte digest[16], struct MD5Context *ctx) { /* Bytes of padding needed to make 56 bytes (-8..55) */ count = 56 - 1 - count; - if (count < 0) { /* Padding forces an extra block */ + if (count < 0) { /* Padding forces an extra block */ memset(p, 0, count + 8); byteSwap(ctx->in, 16); MD5Transform(ctx->buf, ctx->in); @@ -147,8 +142,8 @@ MD5Final(md5byte digest[16], struct MD5Context *ctx) { #define F4(x, y, z) (y ^ (x | ~z)) /* This is the central step in the MD5 algorithm. */ -#define MD5STEP(f,w,x,y,z,in,s) \ - (w += f(x,y,z) + in, w = (w<<s | w>>(32-s)) + x) +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) #if defined(__clang__) && defined(__has_attribute) #if __has_attribute(no_sanitize) @@ -166,8 +161,8 @@ MD5Final(md5byte digest[16], struct MD5Context *ctx) { * reflect the addition of 16 longwords of new data. MD5Update blocks * the data and converts bytes into longwords for this routine. 
*/ -VPX_NO_UNSIGNED_OVERFLOW_CHECK void -MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) { +VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], + UWORD32 const in[16]) { register UWORD32 a, b, c, d; a = buf[0]; diff --git a/rate_hist.c b/rate_hist.c index a77222b16186644caceb6d0ba2c5c08eeee76329..872a10bae0b3b69d95083a169f22905a770788a8 100644 --- a/rate_hist.c +++ b/rate_hist.c @@ -45,8 +45,7 @@ struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg, hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000; // prevent division by zero - if (hist->samples == 0) - hist->samples = 1; + if (hist->samples == 0) hist->samples = 1; hist->frames = 0; hist->total = 0; @@ -78,18 +77,16 @@ void update_rate_histogram(struct rate_hist *hist, int64_t avg_bitrate = 0; int64_t sum_sz = 0; const int64_t now = pkt->data.frame.pts * 1000 * - (uint64_t)cfg->g_timebase.num / - (uint64_t)cfg->g_timebase.den; + (uint64_t)cfg->g_timebase.num / + (uint64_t)cfg->g_timebase.den; int idx = hist->frames++ % hist->samples; hist->pts[idx] = now; hist->sz[idx] = (int)pkt->data.frame.sz; - if (now < cfg->rc_buf_initial_sz) - return; + if (now < cfg->rc_buf_initial_sz) return; - if (!cfg->rc_target_bitrate) - return; + if (!cfg->rc_target_bitrate) return; then = now; @@ -98,20 +95,16 @@ void update_rate_histogram(struct rate_hist *hist, const int i_idx = (i - 1) % hist->samples; then = hist->pts[i_idx]; - if (now - then > cfg->rc_buf_sz) - break; + if (now - then > cfg->rc_buf_sz) break; sum_sz += hist->sz[i_idx]; } - if (now == then) - return; + if (now == then) return; avg_bitrate = sum_sz * 8 * 1000 / (now - then); idx = (int)(avg_bitrate * (RATE_BINS / 2) / (cfg->rc_target_bitrate * 1000)); - if (idx < 0) - idx = 0; - if (idx > RATE_BINS - 1) - idx = RATE_BINS - 1; + if (idx < 0) idx = 0; + if (idx > RATE_BINS - 1) idx = RATE_BINS - 1; if (hist->bucket[idx].low > avg_bitrate) hist->bucket[idx].low = (int)avg_bitrate; if (hist->bucket[idx].high < avg_bitrate) @@ -120,8 +113,8 @@ void update_rate_histogram(struct rate_hist *hist, hist->total++; } -static int merge_hist_buckets(struct hist_bucket *bucket, - int max_buckets, int *num_buckets) { +static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, + int *num_buckets) { int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0; int buckets = *num_buckets; int i; @@ -129,10 +122,8 @@ static int merge_hist_buckets(struct hist_bucket *bucket, /* Find the extrema for this list of buckets */ big_bucket = small_bucket = 0; for (i = 0; i < buckets; i++) { - if (bucket[i].count < bucket[small_bucket].count) - small_bucket = i; - if (bucket[i].count > bucket[big_bucket].count) - big_bucket = i; + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; } /* If we have too many buckets, merge the smallest with an adjacent @@ -174,13 +165,10 @@ static int merge_hist_buckets(struct hist_bucket *bucket, */ big_bucket = small_bucket = 0; for (i = 0; i < buckets; i++) { - if (i > merge_bucket) - bucket[i] = bucket[i + 1]; + if (i > merge_bucket) bucket[i] = bucket[i + 1]; - if (bucket[i].count < bucket[small_bucket].count) - small_bucket = i; - if (bucket[i].count > bucket[big_bucket].count) - big_bucket = i; + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; } } @@ -188,8 +176,8 @@ static int merge_hist_buckets(struct hist_bucket *bucket, return 
bucket[big_bucket].count; } -static void show_histogram(const struct hist_bucket *bucket, - int buckets, int total, int scale) { +static void show_histogram(const struct hist_bucket *bucket, int buckets, + int total, int scale) { const char *pat1, *pat2; int i; @@ -232,8 +220,7 @@ static void show_histogram(const struct hist_bucket *bucket, pct = (float)(100.0 * bucket[i].count / total); len = HIST_BAR_MAX * bucket[i].count / scale; - if (len < 1) - len = 1; + if (len < 1) len = 1; assert(len <= HIST_BAR_MAX); if (bucket[i].low == bucket[i].high) @@ -241,8 +228,7 @@ static void show_histogram(const struct hist_bucket *bucket, else fprintf(stderr, pat2, bucket[i].low, bucket[i].high); - for (j = 0; j < HIST_BAR_MAX; j++) - fprintf(stderr, j < len ? "=" : " "); + for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " "); fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct); } } @@ -268,14 +254,13 @@ void show_q_histogram(const int counts[64], int max_buckets) { show_histogram(bucket, buckets, total, scale); } -void show_rate_histogram(struct rate_hist *hist, - const vpx_codec_enc_cfg_t *cfg, int max_buckets) { +void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, + int max_buckets) { int i, scale; int buckets = 0; for (i = 0; i < RATE_BINS; i++) { - if (hist->bucket[i].low == INT_MAX) - continue; + if (hist->bucket[i].low == INT_MAX) continue; hist->bucket[buckets++] = hist->bucket[i]; } diff --git a/tools_common.c b/tools_common.c index 83eec5013f5a3d6b20b393fdb82f234752e0f68b..e1c89a4cf651490e1d5b5f1c462e67944eb52a1b 100644 --- a/tools_common.c +++ b/tools_common.c @@ -29,23 +29,22 @@ #include <fcntl.h> #ifdef __OS2__ -#define _setmode setmode -#define _fileno fileno -#define _O_BINARY O_BINARY +#define _setmode setmode +#define _fileno fileno +#define _O_BINARY O_BINARY #endif #endif -#define LOG_ERROR(label) do {\ - const char *l = label;\ - va_list ap;\ - va_start(ap, fmt);\ - if (l)\ - fprintf(stderr, "%s: ", l);\ - vfprintf(stderr, fmt, ap);\ - fprintf(stderr, "\n");\ - va_end(ap);\ -} while (0) - +#define LOG_ERROR(label) \ + do { \ + const char *l = label; \ + va_list ap; \ + va_start(ap, fmt); \ + if (l) fprintf(stderr, "%s: ", l); \ + vfprintf(stderr, fmt, ap); \ + fprintf(stderr, "\n"); \ + va_end(ap); \ + } while (0) FILE *set_binary_mode(FILE *stream) { (void)stream; @@ -65,16 +64,13 @@ void fatal(const char *fmt, ...) { exit(EXIT_FAILURE); } -void warn(const char *fmt, ...) { - LOG_ERROR("Warning"); -} +void warn(const char *fmt, ...) { LOG_ERROR("Warning"); } void die_codec(vpx_codec_ctx_t *ctx, const char *s) { const char *detail = vpx_codec_error_detail(ctx); printf("%s: %s\n", s, vpx_codec_error(ctx)); - if (detail) - printf(" %s\n", detail); + if (detail) printf(" %s\n", detail); exit(EXIT_FAILURE); } @@ -97,15 +93,16 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) { */ switch (plane) { case 1: - ptr = yuv_frame->planes[ - yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V : VPX_PLANE_U]; + ptr = + yuv_frame->planes[yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V + : VPX_PLANE_U]; break; case 2: - ptr = yuv_frame->planes[ - yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U : VPX_PLANE_V]; + ptr = + yuv_frame->planes[yuv_frame->fmt == VPX_IMG_FMT_YV12 ? 
VPX_PLANE_U + : VPX_PLANE_V]; break; - default: - ptr = yuv_frame->planes[plane]; + default: ptr = yuv_frame->planes[plane]; } for (r = 0; r < h; ++r) { @@ -134,7 +131,7 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) { static const VpxInterface vpx_encoders[] = { #if CONFIG_VP10_ENCODER - {"vp10", VP10_FOURCC, &vpx_codec_vp10_cx}, + { "vp10", VP10_FOURCC, &vpx_codec_vp10_cx }, #endif }; @@ -142,17 +139,14 @@ int get_vpx_encoder_count(void) { return sizeof(vpx_encoders) / sizeof(vpx_encoders[0]); } -const VpxInterface *get_vpx_encoder_by_index(int i) { - return &vpx_encoders[i]; -} +const VpxInterface *get_vpx_encoder_by_index(int i) { return &vpx_encoders[i]; } const VpxInterface *get_vpx_encoder_by_name(const char *name) { int i; for (i = 0; i < get_vpx_encoder_count(); ++i) { const VpxInterface *encoder = get_vpx_encoder_by_index(i); - if (strcmp(encoder->name, name) == 0) - return encoder; + if (strcmp(encoder->name, name) == 0) return encoder; } return NULL; @@ -165,7 +159,7 @@ const VpxInterface *get_vpx_encoder_by_name(const char *name) { static const VpxInterface vpx_decoders[] = { #if CONFIG_VP10_DECODER - {"vp10", VP10_FOURCC, &vpx_codec_vp10_dx}, + { "vp10", VP10_FOURCC, &vpx_codec_vp10_dx }, #endif }; @@ -173,17 +167,14 @@ int get_vpx_decoder_count(void) { return sizeof(vpx_decoders) / sizeof(vpx_decoders[0]); } -const VpxInterface *get_vpx_decoder_by_index(int i) { - return &vpx_decoders[i]; -} +const VpxInterface *get_vpx_decoder_by_index(int i) { return &vpx_decoders[i]; } const VpxInterface *get_vpx_decoder_by_name(const char *name) { int i; for (i = 0; i < get_vpx_decoder_count(); ++i) { - const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - if (strcmp(decoder->name, name) == 0) - return decoder; + const VpxInterface *const decoder = get_vpx_decoder_by_index(i); + if (strcmp(decoder->name, name) == 0) return decoder; } return NULL; @@ -194,8 +185,7 @@ const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc) { for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - if (decoder->fourcc == fourcc) - return decoder; + if (decoder->fourcc == fourcc) return decoder; } return NULL; @@ -213,7 +203,7 @@ int vpx_img_plane_width(const vpx_image_t *img, int plane) { } int vpx_img_plane_height(const vpx_image_t *img, int plane) { - if (plane > 0 && img->y_chroma_shift > 0) + if (plane > 0 && img->y_chroma_shift > 0) return (img->d_h + 1) >> img->y_chroma_shift; else return img->d_h; @@ -226,7 +216,7 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); const int h = vpx_img_plane_height(img, plane); int y; @@ -244,13 +234,12 @@ int vpx_img_read(vpx_image_t *img, FILE *file) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1); const int h = vpx_img_plane_height(img, plane); int y; for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != (size_t)w) - return 0; + if (fread(buf, 1, w, file) != (size_t)w) return 0; buf += stride; } } @@ -279,19 +268,16 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || - dst->y_chroma_shift != src->y_chroma_shift || - dst->fmt != src->fmt || input_shift < 0) { + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + input_shift < 0) { fatal("Unsupported image conversion"); } switch (src->fmt) { case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: - case VPX_IMG_FMT_I44016: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I44016: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -306,8 +292,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, (uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); - for (x = 0; x < w; x++) - *p_dst++ = (*p_src++ << input_shift) + offset; + for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset; } } } @@ -320,19 +305,15 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift || - dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH || - input_shift < 0) { + dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH || input_shift < 0) { fatal("Unsupported image conversion"); } switch (src->fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: - case VPX_IMG_FMT_I440: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I440: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -353,8 +334,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, } } -void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, - int input_shift) { +void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift) { if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { highbd_img_upshift(dst, src, input_shift); } else { @@ -364,9 +344,8 @@ void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { int plane; - if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt || - dst->d_w != src->d_w || dst->d_h != src->d_h || - dst->x_chroma_shift != src->x_chroma_shift || + if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w || + dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift) { fatal("Unsupported image conversion"); } @@ -374,11 +353,8 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: - case VPX_IMG_FMT_I440: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I440: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -404,19 +380,16 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || 
dst->x_chroma_shift != src->x_chroma_shift || - dst->y_chroma_shift != src->y_chroma_shift || - dst->fmt != src->fmt || down_shift < 0) { + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + down_shift < 0) { fatal("Unsupported image conversion"); } switch (src->fmt) { case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: - case VPX_IMG_FMT_I44016: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I44016: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -431,8 +404,7 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, (uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); - for (x = 0; x < w; x++) - *p_dst++ = *p_src++ >> down_shift; + for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift; } } } @@ -443,19 +415,15 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift || - src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH || - down_shift < 0) { + src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH || down_shift < 0) { fatal("Unsupported image conversion"); } switch (dst->fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: - case VPX_IMG_FMT_I440: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I440: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -476,8 +444,7 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, } } -void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, - int down_shift) { +void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift) { if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { highbd_img_downshift(dst, src, down_shift); } else { diff --git a/tools_common.h b/tools_common.h index 98347b6f27150f6d9fe5f50bfd72ae68e8459fea..ce4fbf8f66e114e9a4a7ff866bc652d5aee9cd2e 100644 --- a/tools_common.h +++ b/tools_common.h @@ -30,24 +30,24 @@ /* MinGW uses f{seek,tell}o64 for large files. */ #define fseeko fseeko64 #define ftello ftello64 -#endif /* _WIN32 */ +#endif /* _WIN32 */ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) -#include <io.h> /* NOLINT */ -#define isatty _isatty -#define fileno _fileno +#include <io.h> /* NOLINT */ +#define isatty _isatty +#define fileno _fileno #else -#include <unistd.h> /* NOLINT */ -#endif /* _MSC_VER */ -#endif /* CONFIG_OS_SUPPORT */ +#include <unistd.h> /* NOLINT */ +#endif /* _MSC_VER */ +#endif /* CONFIG_OS_SUPPORT */ /* Use 32-bit file operations in WebM file format when building ARM * executables (.axf) with RVCT. 
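For orientation, the lookup helpers reformatted in tools_common.c above are typically used as in this minimal sketch (assuming a build with the vp10 entries enabled in the tables shown; fatal() is the helper defined in the same file):

#include <stdio.h>
#include "./tools_common.h"

static void pick_codecs(void) {
  /* Select an encoder by its short name, as listed in vpx_encoders[]. */
  const VpxInterface *encoder = get_vpx_encoder_by_name("vp10");
  if (!encoder) fatal("Unsupported codec name: vp10");

  /* Select the matching decoder by fourcc, e.g. from a container header. */
  const VpxInterface *decoder = get_vpx_decoder_by_fourcc(encoder->fourcc);
  if (!decoder) fatal("No decoder for fourcc 0x%08x", (unsigned int)encoder->fourcc);

  printf("Using %s (decoder: %s)\n", encoder->name, decoder->name);
}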
*/ #if !CONFIG_OS_SUPPORT #define fseeko fseek #define ftello ftell -#endif /* CONFIG_OS_SUPPORT */ +#endif /* CONFIG_OS_SUPPORT */ #define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) @@ -55,7 +55,7 @@ #define PATH_MAX 512 #endif -#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ #define IVF_FILE_HDR_SZ 32 #define RAW_FRAME_HDR_SZ sizeof(uint32_t) @@ -158,7 +158,7 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src); #endif #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif #endif // TOOLS_COMMON_H_ diff --git a/video_reader.c b/video_reader.c index 39c7edba1e3deb5f8d3362deeb26ba581f8a5136..a0ba2521c6135f06bff917b920d321eccbcf1f5f 100644 --- a/video_reader.c +++ b/video_reader.c @@ -30,21 +30,17 @@ VpxVideoReader *vpx_video_reader_open(const char *filename) { char header[32]; VpxVideoReader *reader = NULL; FILE *const file = fopen(filename, "rb"); - if (!file) - return NULL; // Can't open file + if (!file) return NULL; // Can't open file - if (fread(header, 1, 32, file) != 32) - return NULL; // Can't read file header + if (fread(header, 1, 32, file) != 32) return NULL; // Can't read file header if (memcmp(kIVFSignature, header, 4) != 0) return NULL; // Wrong IVF signature - if (mem_get_le16(header + 4) != 0) - return NULL; // Wrong IVF version + if (mem_get_le16(header + 4) != 0) return NULL; // Wrong IVF version reader = calloc(1, sizeof(*reader)); - if (!reader) - return NULL; // Can't allocate VpxVideoReader + if (!reader) return NULL; // Can't allocate VpxVideoReader reader->file = file; reader->info.codec_fourcc = mem_get_le32(header + 8); @@ -71,8 +67,7 @@ int vpx_video_reader_read_frame(VpxVideoReader *reader) { const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, size_t *size) { - if (size) - *size = reader->frame_size; + if (size) *size = reader->frame_size; return reader->buffer; } @@ -80,4 +75,3 @@ const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader) { return &reader->info; } - diff --git a/video_reader.h b/video_reader.h index a62c6d7109ad7d6ddeabc23686194033dd5ef298..73c25b00a7d94740dda8e80ece3148d6f5a018a2 100644 --- a/video_reader.h +++ b/video_reader.h @@ -39,8 +39,7 @@ int vpx_video_reader_read_frame(VpxVideoReader *reader); // Returns the pointer to memory buffer with frame data read by last call to // vpx_video_reader_read_frame(). -const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, - size_t *size); +const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, size_t *size); // Fills VpxVideoInfo with information from opened video file. 
const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader); diff --git a/video_writer.c b/video_writer.c index 3695236bfa5232801598953b99f511dac538a7ad..56d428b0720f7101451ff3b130fa9d47550d9dea 100644 --- a/video_writer.c +++ b/video_writer.c @@ -37,12 +37,10 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, if (container == kContainerIVF) { VpxVideoWriter *writer = NULL; FILE *const file = fopen(filename, "wb"); - if (!file) - return NULL; + if (!file) return NULL; writer = malloc(sizeof(*writer)); - if (!writer) - return NULL; + if (!writer) return NULL; writer->frame_count = 0; writer->info = *info; @@ -67,12 +65,10 @@ void vpx_video_writer_close(VpxVideoWriter *writer) { } } -int vpx_video_writer_write_frame(VpxVideoWriter *writer, - const uint8_t *buffer, size_t size, - int64_t pts) { +int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts) { ivf_write_frame_header(writer->file, pts, size); - if (fwrite(buffer, 1, size, writer->file) != size) - return 0; + if (fwrite(buffer, 1, size, writer->file) != size) return 0; ++writer->frame_count; diff --git a/video_writer.h b/video_writer.h index 5dbfe52ea00f9c7e8dac94bbe189dd1aa611e69b..a769811c44042c3b96f6e9eea5c8802149facc95 100644 --- a/video_writer.h +++ b/video_writer.h @@ -13,9 +13,7 @@ #include "./video_common.h" -typedef enum { - kContainerIVF -} VpxContainer; +typedef enum { kContainerIVF } VpxContainer; struct VpxVideoWriterStruct; typedef struct VpxVideoWriterStruct VpxVideoWriter; @@ -36,9 +34,8 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, void vpx_video_writer_close(VpxVideoWriter *writer); // Writes frame bytes to the file. -int vpx_video_writer_write_frame(VpxVideoWriter *writer, - const uint8_t *buffer, size_t size, - int64_t pts); +int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts); #ifdef __cplusplus } // extern "C" diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index c61b836831c18fa990ce21bf0761b6a99673431e..6a253a57e2438238d806ec3c56b661856c2e4574 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\file * \brief Describes the decoder algorithm interface for algorithm * implementations. @@ -61,7 +60,7 @@ extern "C" { */ #define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/ -typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; +typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; /*!\brief init function pointer prototype @@ -77,8 +76,8 @@ typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; * \retval #VPX_CODEC_MEM_ERROR * Memory operation failed. 
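A quick usage sketch for the IVF reader API being reformatted here; the close counterpart and the exact return convention of read_frame() are assumed from the same header rather than shown in these hunks:

#include <stdio.h>
#include "./video_reader.h"

static void dump_frame_sizes(const char *filename) {
  VpxVideoReader *reader = vpx_video_reader_open(filename);
  if (!reader) return;  /* could not open, or not a valid IVF file */

  const VpxVideoInfo *info = vpx_video_reader_get_info(reader);
  printf("fourcc: 0x%08x\n", (unsigned int)info->codec_fourcc);

  /* read_frame() is assumed to return nonzero while frames remain. */
  while (vpx_video_reader_read_frame(reader)) {
    size_t frame_size = 0;
    (void)vpx_video_reader_get_frame(reader, &frame_size);
    printf("frame of %u bytes\n", (unsigned int)frame_size);
  }

  vpx_video_reader_close(reader);  /* assumed counterpart to _open() */
}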
*/ -typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx, - vpx_codec_priv_enc_mr_cfg_t *data); +typedef vpx_codec_err_t (*vpx_codec_init_fn_t)( + vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data); /*!\brief destroy function pointer prototype * @@ -112,8 +111,8 @@ typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx); * \retval #VPX_CODEC_OK * Bitstream is parsable and stream information updated */ -typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data, - unsigned int data_sz, +typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data, + unsigned int data_sz, vpx_codec_stream_info_t *si); /*!\brief Return information about the current stream. @@ -129,7 +128,7 @@ typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data, * \retval #VPX_CODEC_OK * Bitstream is parsable and stream information updated */ -typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx, +typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx, vpx_codec_stream_info_t *si); /*!\brief control function pointer prototype @@ -193,11 +192,11 @@ typedef const struct vpx_codec_ctrl_fn_map { * see the descriptions of the other error codes in ::vpx_codec_err_t * for recoverability capabilities. */ -typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, - const uint8_t *data, - unsigned int data_sz, - void *user_priv, - long deadline); +typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, + const uint8_t *data, + unsigned int data_sz, + void *user_priv, + long deadline); /*!\brief Decoded frames iterator * @@ -206,7 +205,8 @@ typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, * complete when this function returns NULL. * * The list of available frames becomes valid upon completion of the - * vpx_codec_decode call, and remains valid until the next call to vpx_codec_decode. + * vpx_codec_decode call, and remains valid until the next call to + * vpx_codec_decode. * * \param[in] ctx Pointer to this instance's context * \param[in out] iter Iterator storage, initialized to NULL @@ -215,7 +215,7 @@ typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, * produced will always be in PTS (presentation time stamp) order. */ typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx, - vpx_codec_iter_t *iter); + vpx_codec_iter_t *iter); /*!\brief Pass in external frame buffers for the decoder to use. * @@ -244,32 +244,28 @@ typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx, * buffers. 
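The get_frame iterator contract documented just above is mirrored by the public wrappers reformatted later in this diff (vpx_codec_decode() and vpx_codec_get_frame() in vpx/src/vpx_decoder.c); a minimal drain loop, assuming codec, frame, frame_size and outfile are set up elsewhere:

vpx_codec_iter_t iter = NULL;
vpx_image_t *img = NULL;

if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
  die_codec(&codec, "Failed to decode frame");

/* Frames become valid after vpx_codec_decode() and are drained until NULL. */
while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
  vpx_img_write(img, outfile);  /* raw planar dump from tools_common.c */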
*/ typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)( - vpx_codec_alg_priv_t *ctx, - vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_codec_alg_priv_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); +typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, + const vpx_image_t *img, + vpx_codec_pts_t pts, + unsigned long duration, + vpx_enc_frame_flags_t flags, + unsigned long deadline); +typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)( + vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter); -typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned long duration, - vpx_enc_frame_flags_t flags, - unsigned long deadline); -typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)(vpx_codec_alg_priv_t *ctx, - vpx_codec_iter_t *iter); - -typedef vpx_codec_err_t -(*vpx_codec_enc_config_set_fn_t)(vpx_codec_alg_priv_t *ctx, - const vpx_codec_enc_cfg_t *cfg); -typedef vpx_fixed_buf_t * -(*vpx_codec_get_global_headers_fn_t)(vpx_codec_alg_priv_t *ctx); +typedef vpx_codec_err_t (*vpx_codec_enc_config_set_fn_t)( + vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg); +typedef vpx_fixed_buf_t *(*vpx_codec_get_global_headers_fn_t)( + vpx_codec_alg_priv_t *ctx); -typedef vpx_image_t * -(*vpx_codec_get_preview_frame_fn_t)(vpx_codec_alg_priv_t *ctx); +typedef vpx_image_t *(*vpx_codec_get_preview_frame_fn_t)( + vpx_codec_alg_priv_t *ctx); -typedef vpx_codec_err_t -(*vpx_codec_enc_mr_get_mem_loc_fn_t)(const vpx_codec_enc_cfg_t *cfg, - void **mem_loc); +typedef vpx_codec_err_t (*vpx_codec_enc_mr_get_mem_loc_fn_t)( + const vpx_codec_enc_cfg_t *cfg, void **mem_loc); /*!\brief usage configuration mapping * @@ -282,7 +278,7 @@ typedef vpx_codec_err_t * */ typedef const struct vpx_codec_enc_cfg_map { - int usage; + int usage; vpx_codec_enc_cfg_t cfg; } vpx_codec_enc_cfg_map_t; @@ -291,41 +287,47 @@ typedef const struct vpx_codec_enc_cfg_map { * All decoders \ref MUST expose a variable of this type. 
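The encode side pairs vpx_codec_encode() and vpx_codec_get_cx_data() (both reformatted later in this diff) with the IVF writer from video_writer.c; the packet-kind check and the data.frame.buf member come from the public encoder header rather than from these hunks, so treat them as assumptions:

vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt = NULL;

if (vpx_codec_encode(&codec, img, pts, 1 /* duration */, 0 /* flags */,
                     0 /* deadline */))
  die_codec(&codec, "Failed to encode frame");

while ((pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) continue;  /* assumed from vpx_encoder.h */
  if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf,
                                    pkt->data.frame.sz, pkt->data.frame.pts))
    die_codec(&codec, "Failed to write compressed frame");
}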
*/ struct vpx_codec_iface { - const char *name; /**< Identification String */ - int abi_version; /**< Implemented ABI version */ - vpx_codec_caps_t caps; /**< Decoder capabilities */ - vpx_codec_init_fn_t init; /**< \copydoc ::vpx_codec_init_fn_t */ - vpx_codec_destroy_fn_t destroy; /**< \copydoc ::vpx_codec_destroy_fn_t */ - vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */ + const char *name; /**< Identification String */ + int abi_version; /**< Implemented ABI version */ + vpx_codec_caps_t caps; /**< Decoder capabilities */ + vpx_codec_init_fn_t init; /**< \copydoc ::vpx_codec_init_fn_t */ + vpx_codec_destroy_fn_t destroy; /**< \copydoc ::vpx_codec_destroy_fn_t */ + vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */ struct vpx_codec_dec_iface { - vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ - vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ - vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */ - vpx_codec_get_frame_fn_t get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */ - vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ + vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ + vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ + vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */ + vpx_codec_get_frame_fn_t + get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */ + vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ } dec; struct vpx_codec_enc_iface { - int cfg_map_count; - vpx_codec_enc_cfg_map_t *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ - vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ - vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ - vpx_codec_enc_config_set_fn_t cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ - vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ - vpx_codec_get_preview_frame_fn_t get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ - vpx_codec_enc_mr_get_mem_loc_fn_t mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ + int cfg_map_count; + vpx_codec_enc_cfg_map_t + *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ + vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ + vpx_codec_get_cx_data_fn_t + get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ + vpx_codec_enc_config_set_fn_t + cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ + vpx_codec_get_global_headers_fn_t + get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ + vpx_codec_get_preview_frame_fn_t + get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ + vpx_codec_enc_mr_get_mem_loc_fn_t + mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ } enc; }; /*!\brief Callback function pointer / user data pair storage */ typedef struct vpx_codec_priv_cb_pair { union { - vpx_codec_put_frame_cb_fn_t put_frame; - vpx_codec_put_slice_cb_fn_t put_slice; + vpx_codec_put_frame_cb_fn_t put_frame; + vpx_codec_put_slice_cb_fn_t put_slice; } u; - void *user_priv; + void *user_priv; } vpx_codec_priv_cb_pair_t; - /*!\brief Instance private storage * * This structure is allocated by the algorithm's init function. It can be @@ -335,39 +337,38 @@ typedef struct vpx_codec_priv_cb_pair { * and the pointer cast to the proper type. 
*/ struct vpx_codec_priv { - const char *err_detail; - vpx_codec_flags_t init_flags; + const char *err_detail; + vpx_codec_flags_t init_flags; struct { - vpx_codec_priv_cb_pair_t put_frame_cb; - vpx_codec_priv_cb_pair_t put_slice_cb; + vpx_codec_priv_cb_pair_t put_frame_cb; + vpx_codec_priv_cb_pair_t put_slice_cb; } dec; struct { - vpx_fixed_buf_t cx_data_dst_buf; - unsigned int cx_data_pad_before; - unsigned int cx_data_pad_after; - vpx_codec_cx_pkt_t cx_data_pkt; - unsigned int total_encoders; + vpx_fixed_buf_t cx_data_dst_buf; + unsigned int cx_data_pad_before; + unsigned int cx_data_pad_after; + vpx_codec_cx_pkt_t cx_data_pkt; + unsigned int total_encoders; } enc; }; /* * Multi-resolution encoding internal configuration */ -struct vpx_codec_priv_enc_mr_cfg -{ - unsigned int mr_total_resolutions; - unsigned int mr_encoder_id; - struct vpx_rational mr_down_sampling_factor; - void* mr_low_res_mode_info; +struct vpx_codec_priv_enc_mr_cfg { + unsigned int mr_total_resolutions; + unsigned int mr_encoder_id; + struct vpx_rational mr_down_sampling_factor; + void *mr_low_res_mode_info; }; #undef VPX_CTRL_USE_TYPE #define VPX_CTRL_USE_TYPE(id, typ) \ - static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} + static VPX_INLINE typ id##__value(va_list args) { return va_arg(args, typ); } #undef VPX_CTRL_USE_TYPE_DEPRECATED #define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} + static VPX_INLINE typ id##__value(va_list args) { return va_arg(args, typ); } #define CAST(id, arg) id##__value(arg) @@ -380,10 +381,9 @@ struct vpx_codec_priv_enc_mr_cfg * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE * macro is provided to define this getter function automatically. */ -#define CODEC_INTERFACE(id)\ - vpx_codec_iface_t* id(void) { return &id##_algo; }\ - vpx_codec_iface_t id##_algo - +#define CODEC_INTERFACE(id) \ + vpx_codec_iface_t *id(void) { return &id##_algo; } \ + vpx_codec_iface_t id##_algo /* Internal Utility Functions * @@ -391,38 +391,39 @@ struct vpx_codec_priv_enc_mr_cfg * utilities for manipulating vpx_codec_* data structures. 
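The CODEC_INTERFACE() macro above defines both the _algo object and its getter in one place; a schematic use for a hypothetical codec follows (the example_* functions are placeholders, VPX_CODEC_CAP_DECODER comes from the public header, and the member order follows the vpx_codec_iface struct in the previous hunk):

CODEC_INTERFACE(vpx_codec_example_dx) = {
  "Example Decoder",              /* name */
  VPX_CODEC_INTERNAL_ABI_VERSION, /* abi_version */
  VPX_CODEC_CAP_DECODER,          /* caps */
  example_init,                   /* init */
  example_destroy,                /* destroy */
  example_ctrl_maps,              /* ctrl_maps */
  { example_peek_si, example_get_si, example_decode, example_get_frame,
    NULL /* set_fb_fn */ },       /* dec */
  { 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL } /* enc (decoder only) */
};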
*/ struct vpx_codec_pkt_list { - unsigned int cnt; - unsigned int max; + unsigned int cnt; + unsigned int max; struct vpx_codec_cx_pkt pkts[1]; }; -#define vpx_codec_pkt_list_decl(n)\ - union {struct vpx_codec_pkt_list head;\ - struct {struct vpx_codec_pkt_list head;\ - struct vpx_codec_cx_pkt pkts[n];} alloc;} - -#define vpx_codec_pkt_list_init(m)\ - (m)->alloc.head.cnt = 0,\ - (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0]) +#define vpx_codec_pkt_list_decl(n) \ + union { \ + struct vpx_codec_pkt_list head; \ + struct { \ + struct vpx_codec_pkt_list head; \ + struct vpx_codec_cx_pkt pkts[n]; \ + } alloc; \ + } -int -vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *, - const struct vpx_codec_cx_pkt *); +#define vpx_codec_pkt_list_init(m) \ + (m)->alloc.head.cnt = 0, \ + (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0]) -const vpx_codec_cx_pkt_t * -vpx_codec_pkt_list_get(struct vpx_codec_pkt_list *list, - vpx_codec_iter_t *iter); +int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *, + const struct vpx_codec_cx_pkt *); +const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get( + struct vpx_codec_pkt_list *list, vpx_codec_iter_t *iter); #include <stdio.h> #include <setjmp.h> struct vpx_internal_error_info { - vpx_codec_err_t error_code; - int has_detail; - char detail[80]; - int setjmp; - jmp_buf jmp; + vpx_codec_err_t error_code; + int has_detail; + char detail[80]; + int setjmp; + jmp_buf jmp; }; #define CLANG_ANALYZER_NORETURN @@ -434,8 +435,7 @@ struct vpx_internal_error_info { #endif void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, - const char *fmt, + vpx_codec_err_t error, const char *fmt, ...) CLANG_ANALYZER_NORETURN; #ifdef __cplusplus diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c index 5a495ce814b814fe3fef105fb76894e754fda420..f222b9e5cb6f5073f551fec9f332bca7d01a2793 100644 --- a/vpx/src/vpx_codec.c +++ b/vpx/src/vpx_codec.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\file * \brief Provides the high level interface to wrap decoder algorithms. * @@ -19,67 +18,50 @@ #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" -#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) - -int vpx_codec_version(void) { - return VERSION_PACKED; -} - - -const char *vpx_codec_version_str(void) { - return VERSION_STRING_NOSP; -} +#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) +int vpx_codec_version(void) { return VERSION_PACKED; } -const char *vpx_codec_version_extra_str(void) { - return VERSION_EXTRA; -} +const char *vpx_codec_version_str(void) { return VERSION_STRING_NOSP; } +const char *vpx_codec_version_extra_str(void) { return VERSION_EXTRA; } const char *vpx_codec_iface_name(vpx_codec_iface_t *iface) { return iface ? 
iface->name : "<invalid interface>"; } -const char *vpx_codec_err_to_string(vpx_codec_err_t err) { +const char *vpx_codec_err_to_string(vpx_codec_err_t err) { switch (err) { - case VPX_CODEC_OK: - return "Success"; - case VPX_CODEC_ERROR: - return "Unspecified internal error"; - case VPX_CODEC_MEM_ERROR: - return "Memory allocation error"; - case VPX_CODEC_ABI_MISMATCH: - return "ABI version mismatch"; + case VPX_CODEC_OK: return "Success"; + case VPX_CODEC_ERROR: return "Unspecified internal error"; + case VPX_CODEC_MEM_ERROR: return "Memory allocation error"; + case VPX_CODEC_ABI_MISMATCH: return "ABI version mismatch"; case VPX_CODEC_INCAPABLE: return "Codec does not implement requested capability"; case VPX_CODEC_UNSUP_BITSTREAM: return "Bitstream not supported by this decoder"; case VPX_CODEC_UNSUP_FEATURE: return "Bitstream required feature not supported by this decoder"; - case VPX_CODEC_CORRUPT_FRAME: - return "Corrupt frame detected"; - case VPX_CODEC_INVALID_PARAM: - return "Invalid parameter"; - case VPX_CODEC_LIST_END: - return "End of iterated list"; + case VPX_CODEC_CORRUPT_FRAME: return "Corrupt frame detected"; + case VPX_CODEC_INVALID_PARAM: return "Invalid parameter"; + case VPX_CODEC_LIST_END: return "End of iterated list"; } return "Unrecognized error code"; } -const char *vpx_codec_error(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error(vpx_codec_ctx_t *ctx) { return (ctx) ? vpx_codec_err_to_string(ctx->err) - : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); + : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); } -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) { if (ctx && ctx->err) return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; return NULL; } - vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { vpx_codec_err_t res; @@ -99,15 +81,11 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { return SAVE_STATUS(ctx, res); } - vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) { return (iface) ? iface->caps : 0; } - -vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, - int ctrl_id, - ...) { +vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) { vpx_codec_err_t res; if (!ctx || !ctrl_id) @@ -121,7 +99,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) { if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) { - va_list ap; + va_list ap; va_start(ap, ctrl_id); res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap); @@ -135,16 +113,14 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, } void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, - const char *fmt, - ...) { + vpx_codec_err_t error, const char *fmt, ...) { va_list ap; info->error_code = error; info->has_detail = 0; if (fmt) { - size_t sz = sizeof(info->detail); + size_t sz = sizeof(info->detail); info->has_detail = 1; va_start(ap, fmt); @@ -153,6 +129,5 @@ void vpx_internal_error(struct vpx_internal_error_info *info, info->detail[sz - 1] = '\0'; } - if (info->setjmp) - longjmp(info->jmp, info->error_code); + if (info->setjmp) longjmp(info->jmp, info->error_code); } diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c index 802d8edd8a437a3e424941adb3273485ff00b23d..fc1c2bccae77f5067c26bd44af66e316d8908fbd 100644 --- a/vpx/src/vpx_decoder.c +++ b/vpx/src/vpx_decoder.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
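For reference, the three error accessors reformatted here are exactly what die_codec() in tools_common.c builds on; a non-fatal variant might look like this sketch:

#include <stdio.h>
#include "vpx/vpx_codec.h"

/* Report a failed call on ctx without exiting, mirroring die_codec(). */
static void report_codec_error(vpx_codec_ctx_t *ctx, const char *what) {
  const char *detail = vpx_codec_error_detail(ctx);
  fprintf(stderr, "%s: %s\n", what, vpx_codec_error(ctx));
  if (detail) fprintf(stderr, "  %s\n", detail);
}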
*/ - /*!\file * \brief Provides the high level interface to wrap decoder algorithms. * @@ -16,17 +15,16 @@ #include <string.h> #include "vpx/internal/vpx_codec_internal.h" -#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) +#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { return (vpx_codec_alg_priv_t *)ctx->priv; } -vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, +vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, const vpx_codec_dec_cfg_t *cfg, - vpx_codec_flags_t flags, - int ver) { + vpx_codec_flags_t flags, int ver) { vpx_codec_err_t res; if (ver != VPX_DECODER_ABI_VERSION) @@ -35,7 +33,8 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_INVALID_PARAM; else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) res = VPX_CODEC_ABI_MISMATCH; - else if ((flags & VPX_CODEC_USE_POSTPROC) && !(iface->caps & VPX_CODEC_CAP_POSTPROC)) + else if ((flags & VPX_CODEC_USE_POSTPROC) && + !(iface->caps & VPX_CODEC_CAP_POSTPROC)) res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_ERROR_CONCEALMENT) && !(iface->caps & VPX_CODEC_CAP_ERROR_CONCEALMENT)) @@ -63,15 +62,14 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, - const uint8_t *data, - unsigned int data_sz, +vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, + const uint8_t *data, + unsigned int data_sz, vpx_codec_stream_info_t *si) { vpx_codec_err_t res; - if (!iface || !data || !data_sz || !si - || si->sz < sizeof(vpx_codec_stream_info_t)) + if (!iface || !data || !data_sz || !si || + si->sz < sizeof(vpx_codec_stream_info_t)) res = VPX_CODEC_INVALID_PARAM; else { /* Set default/unknown values */ @@ -84,8 +82,7 @@ vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, return res; } - -vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, +vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, vpx_codec_stream_info_t *si) { vpx_codec_err_t res; @@ -104,12 +101,9 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, - const uint8_t *data, - unsigned int data_sz, - void *user_priv, - long deadline) { +vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, + unsigned int data_sz, void *user_priv, + long deadline) { vpx_codec_err_t res; /* Sanity checks */ @@ -126,8 +120,7 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } -vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, - vpx_codec_iter_t *iter) { +vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img; if (!ctx || !iter || !ctx->iface || !ctx->priv) @@ -138,16 +131,15 @@ vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, return img; } - -vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, - vpx_codec_put_frame_cb_fn_t cb, - void *user_priv) { +vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_frame_cb_fn_t cb, + void *user_priv) { vpx_codec_err_t res; if (!ctx || !cb) res = VPX_CODEC_INVALID_PARAM; - else if (!ctx->iface || !ctx->priv - || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME)) + else if (!ctx->iface || !ctx->priv || + !(ctx->iface->caps & 
VPX_CODEC_CAP_PUT_FRAME)) res = VPX_CODEC_ERROR; else { ctx->priv->dec.put_frame_cb.u.put_frame = cb; @@ -158,16 +150,15 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, - vpx_codec_put_slice_cb_fn_t cb, - void *user_priv) { +vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_slice_cb_fn_t cb, + void *user_priv) { vpx_codec_err_t res; if (!ctx || !cb) res = VPX_CODEC_INVALID_PARAM; - else if (!ctx->iface || !ctx->priv - || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) + else if (!ctx->iface || !ctx->priv || + !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) res = VPX_CODEC_ERROR; else { ctx->priv->dec.put_slice_cb.u.put_slice = cb; diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index cd10c411ceaf5ec0e33e58da7bbbf61c4cc2f08e..4390cf7c8f1f5ecf8dd356b97a722a384f071113 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\file * \brief Provides the high level interface to wrap encoder algorithms. * @@ -18,17 +17,16 @@ #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" -#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) +#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { return (vpx_codec_alg_priv_t *)ctx->priv; } -vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, +vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, const vpx_codec_enc_cfg_t *cfg, - vpx_codec_flags_t flags, - int ver) { + vpx_codec_flags_t flags, int ver) { vpx_codec_err_t res; if (ver != VPX_ENCODER_ABI_VERSION) @@ -39,11 +37,10 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_PSNR) - && !(iface->caps & VPX_CODEC_CAP_PSNR)) + else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) - && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) + else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) && + !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) res = VPX_CODEC_INCAPABLE; else { ctx->iface = iface; @@ -62,13 +59,9 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } -vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, - int num_enc, - vpx_codec_flags_t flags, - vpx_rational_t *dsf, - int ver) { +vpx_codec_err_t vpx_codec_enc_init_multi_ver( + vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, + int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver) { vpx_codec_err_t res = VPX_CODEC_OK; if (ver != VPX_ENCODER_ABI_VERSION) @@ -79,11 +72,10 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_PSNR) - && !(iface->caps & VPX_CODEC_CAP_PSNR)) + else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) - && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) 
+ else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) && + !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) res = VPX_CODEC_INCAPABLE; else { int i; @@ -110,8 +102,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, * resolution always use the same frame_type chosen by the * lowest-resolution encoder. */ - if (mr_cfg.mr_encoder_id) - cfg->kf_mode = VPX_KF_DISABLED; + if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; ctx->iface = iface; ctx->name = iface->name; @@ -121,8 +112,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, res = ctx->iface->init(ctx, &mr_cfg); if (res) { - const char *error_detail = - ctx->priv ? ctx->priv->err_detail : NULL; + const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; /* Destroy current ctx */ ctx->err_detail = error_detail; vpx_codec_destroy(ctx); @@ -136,8 +126,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, } } - if (res) - break; + if (res) break; ctx++; cfg++; @@ -150,10 +139,9 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, - unsigned int usage) { +vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + unsigned int usage) { vpx_codec_err_t res; vpx_codec_enc_cfg_map_t *map; int i; @@ -179,30 +167,28 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, return res; } - #if ARCH_X86 || ARCH_X86_64 /* On X86, disable the x87 unit's internal 80 bit precision for better * consistency with the SSE unit's 64 bit precision. */ #include "vpx_ports/x86.h" -#define FLOATING_POINT_INIT() do {\ +#define FLOATING_POINT_INIT() \ + do { \ unsigned short x87_orig_mode = x87_set_double_precision(); -#define FLOATING_POINT_RESTORE() \ - x87_set_control_word(x87_orig_mode); }while(0) - +#define FLOATING_POINT_RESTORE() \ + x87_set_control_word(x87_orig_mode); \ + } \ + while (0) #else static void FLOATING_POINT_INIT() {} static void FLOATING_POINT_RESTORE() {} #endif - -vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned long duration, - vpx_enc_frame_flags_t flags, - unsigned long deadline) { +vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned long duration, + vpx_enc_frame_flags_t flags, + unsigned long deadline) { vpx_codec_err_t res = VPX_CODEC_OK; if (!ctx || (img && !duration)) @@ -220,8 +206,8 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, FLOATING_POINT_INIT(); if (num_enc == 1) - res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, - duration, flags, deadline); + res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags, + deadline); else { /* Multi-resolution encoding: * Encode multi-levels in reverse order. 
For example, @@ -234,8 +220,8 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, if (img) img += num_enc - 1; for (i = num_enc - 1; i >= 0; i--) { - if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, - duration, flags, deadline))) + if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, + flags, deadline))) break; ctx--; @@ -250,7 +236,6 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter) { const vpx_codec_cx_pkt_t *pkt = NULL; @@ -273,18 +258,18 @@ const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, vpx_codec_priv_t *const priv = ctx->priv; char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf; - if (dst_buf && - pkt->data.raw.buf != dst_buf && + if (dst_buf && pkt->data.raw.buf != dst_buf && pkt->data.raw.sz + priv->enc.cx_data_pad_before + - priv->enc.cx_data_pad_after <= priv->enc.cx_data_dst_buf.sz) { + priv->enc.cx_data_pad_after <= + priv->enc.cx_data_dst_buf.sz) { vpx_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt; memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf, pkt->data.raw.sz); *modified_pkt = *pkt; modified_pkt->data.raw.buf = dst_buf; - modified_pkt->data.raw.sz += priv->enc.cx_data_pad_before + - priv->enc.cx_data_pad_after; + modified_pkt->data.raw.sz += + priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after; pkt = modified_pkt; } @@ -297,13 +282,11 @@ const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, return pkt; } - -vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, +vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, const vpx_fixed_buf_t *buf, - unsigned int pad_before, - unsigned int pad_after) { - if (!ctx || !ctx->priv) - return VPX_CODEC_INVALID_PARAM; + unsigned int pad_before, + unsigned int pad_after) { + if (!ctx || !ctx->priv) return VPX_CODEC_INVALID_PARAM; if (buf) { ctx->priv->enc.cx_data_dst_buf = *buf; @@ -319,8 +302,7 @@ vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, return VPX_CODEC_OK; } - -const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) { +const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) { vpx_image_t *img = NULL; if (ctx) { @@ -337,8 +319,7 @@ const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) { return img; } - -vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) { +vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) { vpx_fixed_buf_t *buf = NULL; if (ctx) { @@ -355,9 +336,8 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) { return buf; } - -vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, - const vpx_codec_enc_cfg_t *cfg) { +vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, + const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; if (!ctx || !ctx->iface || !ctx->priv || !cfg) @@ -370,7 +350,6 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *list, const struct vpx_codec_cx_pkt *pkt) { if (list->cnt < list->max) { @@ -381,9 +360,8 @@ int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *list, return 1; } - -const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get(struct vpx_codec_pkt_list *list, - vpx_codec_iter_t *iter) { +const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get( + struct vpx_codec_pkt_list *list, vpx_codec_iter_t *iter) { const 
vpx_codec_cx_pkt_t *pkt; if (!(*iter)) { diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c index 9aae12c794ba225ed78a74eb6c36108c431910ef..dba439c10a8490f82e1a7dc03f2f90f7f66309ca 100644 --- a/vpx/src/vpx_image.c +++ b/vpx/src/vpx_image.c @@ -15,10 +15,8 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -static vpx_image_t *img_alloc_helper(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, +static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, unsigned int buf_align, unsigned int stride_align, unsigned char *img_data) { @@ -27,68 +25,44 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, int align; /* Treat align==0 like align==1 */ - if (!buf_align) - buf_align = 1; + if (!buf_align) buf_align = 1; /* Validate alignment (must be power of 2) */ - if (buf_align & (buf_align - 1)) - goto fail; + if (buf_align & (buf_align - 1)) goto fail; /* Treat align==0 like align==1 */ - if (!stride_align) - stride_align = 1; + if (!stride_align) stride_align = 1; /* Validate alignment (must be power of 2) */ - if (stride_align & (stride_align - 1)) - goto fail; + if (stride_align & (stride_align - 1)) goto fail; /* Get sample size for this format */ switch (fmt) { case VPX_IMG_FMT_RGB32: case VPX_IMG_FMT_RGB32_LE: case VPX_IMG_FMT_ARGB: - case VPX_IMG_FMT_ARGB_LE: - bps = 32; - break; + case VPX_IMG_FMT_ARGB_LE: bps = 32; break; case VPX_IMG_FMT_RGB24: - case VPX_IMG_FMT_BGR24: - bps = 24; - break; + case VPX_IMG_FMT_BGR24: bps = 24; break; case VPX_IMG_FMT_RGB565: case VPX_IMG_FMT_RGB565_LE: case VPX_IMG_FMT_RGB555: case VPX_IMG_FMT_RGB555_LE: case VPX_IMG_FMT_UYVY: case VPX_IMG_FMT_YUY2: - case VPX_IMG_FMT_YVYU: - bps = 16; - break; + case VPX_IMG_FMT_YVYU: bps = 16; break; case VPX_IMG_FMT_I420: case VPX_IMG_FMT_YV12: case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: - bps = 12; - break; + case VPX_IMG_FMT_VPXYV12: bps = 12; break; case VPX_IMG_FMT_I422: - case VPX_IMG_FMT_I440: - bps = 16; - break; - case VPX_IMG_FMT_I444: - bps = 24; - break; - case VPX_IMG_FMT_I42016: - bps = 24; - break; + case VPX_IMG_FMT_I440: bps = 16; break; + case VPX_IMG_FMT_I444: bps = 24; break; + case VPX_IMG_FMT_I42016: bps = 24; break; case VPX_IMG_FMT_I42216: - case VPX_IMG_FMT_I44016: - bps = 32; - break; - case VPX_IMG_FMT_I44416: - bps = 48; - break; - default: - bps = 16; - break; + case VPX_IMG_FMT_I44016: bps = 32; break; + case VPX_IMG_FMT_I44416: bps = 48; break; + default: bps = 16; break; } /* Get chroma shift values for this format */ @@ -99,12 +73,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, case VPX_IMG_FMT_VPXYV12: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I42016: - case VPX_IMG_FMT_I42216: - xcs = 1; - break; - default: - xcs = 0; - break; + case VPX_IMG_FMT_I42216: xcs = 1; break; + default: xcs = 0; break; } switch (fmt) { @@ -114,12 +84,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, case VPX_IMG_FMT_VPXI420: case VPX_IMG_FMT_VPXYV12: case VPX_IMG_FMT_I42016: - case VPX_IMG_FMT_I44016: - ycs = 1; - break; - default: - ycs = 0; - break; + case VPX_IMG_FMT_I44016: ycs = 1; break; + default: ycs = 0; break; } /* Calculate storage sizes given the chroma subsampling */ @@ -135,8 +101,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, if (!img) { img = (vpx_image_t *)calloc(1, sizeof(vpx_image_t)); - if (!img) - goto fail; + if (!img) goto fail; img->self_allocd = 1; } else { @@ -146,18 +111,17 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, 
img->img_data = img_data; if (!img_data) { - const uint64_t alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ? - (uint64_t)h * s * bps / 8 : (uint64_t)h * s; + const uint64_t alloc_size = (fmt & VPX_IMG_FMT_PLANAR) + ? (uint64_t)h * s * bps / 8 + : (uint64_t)h * s; - if (alloc_size != (size_t)alloc_size) - goto fail; + if (alloc_size != (size_t)alloc_size) goto fail; img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size); img->img_data_owner = 1; } - if (!img->img_data) - goto fail; + if (!img->img_data) goto fail; img->fmt = fmt; img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; @@ -172,39 +136,30 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = stride_in_bytes >> xcs; /* Default viewport to entire image */ - if (!vpx_img_set_rect(img, 0, 0, d_w, d_h)) - return img; + if (!vpx_img_set_rect(img, 0, 0, d_w, d_h)) return img; fail: vpx_img_free(img); return NULL; } -vpx_image_t *vpx_img_alloc(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, - unsigned int align) { +vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align) { return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL); } -vpx_image_t *vpx_img_wrap(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, - unsigned int stride_align, - unsigned char *img_data) { +vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int stride_align, + unsigned char *img_data) { /* By setting buf_align = 1, we don't change buffer alignment in this * function. */ return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data); } -int vpx_img_set_rect(vpx_image_t *img, - unsigned int x, - unsigned int y, - unsigned int w, - unsigned int h) { - unsigned char *data; +int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h) { + unsigned char *data; if (x + w <= img->w && y + h <= img->h) { img->d_w = w; @@ -213,7 +168,7 @@ int vpx_img_set_rect(vpx_image_t *img, /* Calculate plane pointers */ if (!(img->fmt & VPX_IMG_FMT_PLANAR)) { img->planes[VPX_PLANE_PACKED] = - img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED]; + img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED]; } else { const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; @@ -225,8 +180,8 @@ int vpx_img_set_rect(vpx_image_t *img, data += img->h * img->stride[VPX_PLANE_ALPHA]; } - img->planes[VPX_PLANE_Y] = data + x * bytes_per_sample + - y * img->stride[VPX_PLANE_Y]; + img->planes[VPX_PLANE_Y] = + data + x * bytes_per_sample + y * img->stride[VPX_PLANE_Y]; data += img->h * img->stride[VPX_PLANE_Y]; if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) { @@ -262,24 +217,23 @@ void vpx_img_flip(vpx_image_t *img) { img->planes[VPX_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[VPX_PLANE_Y]; img->stride[VPX_PLANE_Y] = -img->stride[VPX_PLANE_Y]; - img->planes[VPX_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) - * img->stride[VPX_PLANE_U]; + img->planes[VPX_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[VPX_PLANE_U]; img->stride[VPX_PLANE_U] = -img->stride[VPX_PLANE_U]; - img->planes[VPX_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) - * img->stride[VPX_PLANE_V]; + img->planes[VPX_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[VPX_PLANE_V]; img->stride[VPX_PLANE_V] = -img->stride[VPX_PLANE_V]; - img->planes[VPX_PLANE_ALPHA] += (signed)(img->d_h - 1) * img->stride[VPX_PLANE_ALPHA]; + img->planes[VPX_PLANE_ALPHA] += + (signed)(img->d_h - 1) * img->stride[VPX_PLANE_ALPHA]; img->stride[VPX_PLANE_ALPHA] = -img->stride[VPX_PLANE_ALPHA]; } void vpx_img_free(vpx_image_t *img) { if (img) { - if (img->img_data && img->img_data_owner) - vpx_free(img->img_data); + if (img->img_data && img->img_data_owner) vpx_free(img->img_data); - if (img->self_allocd) - free(img); + if (img->self_allocd) free(img); } } diff --git a/vpx/vp8.h b/vpx/vp8.h index ba67c38366516daff3f91e2660c2cbc6e6bb752b..e27b705a9b76f8e4c59098224babbc7db0fb0568 100644 --- a/vpx/vp8.h +++ b/vpx/vp8.h @@ -42,24 +42,27 @@ extern "C" { * The set of macros define the control functions of VP8 interface */ enum vp8_com_control_id { - VP8_SET_REFERENCE = 1, /**< pass in an external frame into decoder to be used as reference frame */ - VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ - VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ - VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< set the reference frames to color for each macroblock */ - VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */ - VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */ - VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */ + /*!\brief pass in an external frame into decoder to be used as reference frame + */ + VP8_SET_REFERENCE = 1, + VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ + VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ + VP8_SET_DBG_COLOR_REF_FRAME = + 4, /**< set the reference frames to color for each macroblock */ + VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */ + VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */ + VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */ /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+) * for its control ids. These should be migrated to something like the * VP8_DECODER_CTRL_ID_START range next time we're ready to break the ABI. 
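A short lifecycle sketch for the image helpers above (note that vpx_img_set_rect() returns 0 on success, which is how img_alloc_helper() itself calls it):

/* Allocate a 640x480 I420 image with 32-byte row alignment. */
vpx_image_t *img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, 640, 480, 32);
if (!img) fatal("Failed to allocate image");

/* Shrink the viewport to the top 640x360 region; nonzero means failure. */
if (vpx_img_set_rect(img, 0, 0, 640, 360))
  fatal("Failed to set viewport");

/* ... fill pixel data through img->planes[] and img->stride[] ... */

vpx_img_free(img);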
*/ - VP9_GET_REFERENCE = 128, /**< get a pointer to a reference frame */ + VP9_GET_REFERENCE = 128, /**< get a pointer to a reference frame */ VP8_COMMON_CTRL_ID_MAX, - VP10_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */ + VP10_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */ - VP8_DECODER_CTRL_ID_START = 256 + VP8_DECODER_CTRL_ID_START = 256 }; /*!\brief post process flags @@ -67,15 +70,16 @@ enum vp8_com_control_id { * The set of macros define VP8 decoder post processing flags */ enum vp8_postproc_level { - VP8_NOFILTERING = 0, - VP8_DEBLOCK = 1 << 0, - VP8_DEMACROBLOCK = 1 << 1, - VP8_ADDNOISE = 1 << 2, - VP8_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */ - VP8_DEBUG_TXT_MBLK_MODES = 1 << 4, /**< print macro block modes over each macro block */ - VP8_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */ - VP8_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */ - VP8_MFQE = 1 << 10 + VP8_NOFILTERING = 0, + VP8_DEBLOCK = 1 << 0, + VP8_DEMACROBLOCK = 1 << 1, + VP8_ADDNOISE = 1 << 2, + VP8_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */ + VP8_DEBUG_TXT_MBLK_MODES = + 1 << 4, /**< print macro block modes over each macro block */ + VP8_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */ + VP8_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */ + VP8_MFQE = 1 << 10 }; /*!\brief post process flags @@ -86,9 +90,11 @@ enum vp8_postproc_level { */ typedef struct vp8_postproc_cfg { - int post_proc_flag; /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */ - int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */ - int noise_level; /**< the strength of additive noise, valid range [0, 16] */ + /*!\brief the types of post processing to be done, should be combination of + * "vp8_postproc_level" */ + int post_proc_flag; + int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */ + int noise_level; /**< the strength of additive noise, valid range [0, 16] */ } vp8_postproc_cfg_t; /*!\brief reference frame type @@ -106,8 +112,8 @@ typedef enum vpx_ref_frame_type { * Define the data struct to access vp8 reference frames. */ typedef struct vpx_ref_frame { - vpx_ref_frame_type_t frame_type; /**< which reference frame */ - vpx_image_t img; /**< reference frame data in image format */ + vpx_ref_frame_type_t frame_type; /**< which reference frame */ + vpx_image_t img; /**< reference frame data in image format */ } vpx_ref_frame_t; /*!\brief VP9 specific reference frame data struct @@ -115,8 +121,8 @@ typedef struct vpx_ref_frame { * Define the data struct to access vp9 reference frames. 
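To ground the post-processing knobs documented here, a decoder-side sketch (vpx_codec_control() is the public wrapper macro, not touched by this diff, and the decoder is assumed to have been initialized with VPX_CODEC_USE_POSTPROC as checked in vpx_decoder.c above):

/* Enable deblocking plus macroblock-level smoothing at moderate strength. */
vp8_postproc_cfg_t pp;
pp.post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK;
pp.deblocking_level = 4; /* valid range [0, 16] per the comment above */
pp.noise_level = 0;

if (vpx_codec_control(&decoder, VP8_SET_POSTPROC, &pp))
  die_codec(&decoder, "Failed to set post-processing options");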
*/ typedef struct vp9_ref_frame { - int idx; /**< frame index to get (input) */ - vpx_image_t img; /**< img structure to populate (output) */ + int idx; /**< frame index to get (input) */ + vpx_image_t img; /**< img structure to populate (output) */ } vp9_ref_frame_t; /*!\cond */ @@ -124,23 +130,23 @@ typedef struct vp9_ref_frame { * * defines the data type for each of VP8 decoder control function requires */ -VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *) +VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *) #define VPX_CTRL_VP8_SET_REFERENCE -VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) +VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) #define VPX_CTRL_VP8_COPY_REFERENCE -VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) +VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) #define VPX_CTRL_VP8_SET_POSTPROC VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int) #define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME -VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int) #define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES -VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int) #define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES -VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int) #define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV -VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) +VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) #define VPX_CTRL_VP9_GET_REFERENCE -VPX_CTRL_USE_TYPE(VP10_GET_NEW_FRAME_IMAGE, vpx_image_t *) +VPX_CTRL_USE_TYPE(VP10_GET_NEW_FRAME_IMAGE, vpx_image_t *) #define VPX_CTRL_VP10_GET_NEW_FRAME_IMAGE /*!\endcond */ diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 954ad3295351a1f3654f0904b24dac52acc19e7d..3b410580182f7bceadf6abb8d38d24f4223c60cc 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -32,7 +32,7 @@ extern "C" { * This interface provides the capability to encode raw VP10 streams. * @{ */ -extern vpx_codec_iface_t vpx_codec_vp10_cx_algo; +extern vpx_codec_iface_t vpx_codec_vp10_cx_algo; extern vpx_codec_iface_t *vpx_codec_vp10_cx(void); /*!@} - end algorithm interface member group*/ @@ -46,8 +46,7 @@ extern vpx_codec_iface_t *vpx_codec_vp10_cx(void); * predictor. When not set, the encoder will choose whether to use the * last frame or not automatically. */ -#define VP8_EFLAG_NO_REF_LAST (1<<16) - +#define VP8_EFLAG_NO_REF_LAST (1 << 16) /*!\brief Don't reference the golden frame * @@ -55,8 +54,7 @@ extern vpx_codec_iface_t *vpx_codec_vp10_cx(void); * predictor. When not set, the encoder will choose whether to use the * golden frame or not automatically. */ -#define VP8_EFLAG_NO_REF_GF (1<<17) - +#define VP8_EFLAG_NO_REF_GF (1 << 17) /*!\brief Don't reference the alternate reference frame * @@ -64,56 +62,49 @@ extern vpx_codec_iface_t *vpx_codec_vp10_cx(void); * predictor. When not set, the encoder will choose whether to use the * alt ref frame or not automatically. */ -#define VP8_EFLAG_NO_REF_ARF (1<<21) - +#define VP8_EFLAG_NO_REF_ARF (1 << 21) /*!\brief Don't update the last frame * * When this flag is set, the encoder will not update the last frame with * the contents of the current frame. */ -#define VP8_EFLAG_NO_UPD_LAST (1<<18) - +#define VP8_EFLAG_NO_UPD_LAST (1 << 18) /*!\brief Don't update the golden frame * * When this flag is set, the encoder will not update the golden frame with * the contents of the current frame. 
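/* Usage sketch for vp9_ref_frame_t and VP9_GET_REFERENCE: illustrative only,
 * not part of this header.  It assumes `decoder` is an initialized decoder
 * context that supports this control and has decoded at least one frame. */
#include "vpx/vp8.h"

static int get_reference(vpx_codec_ctx_t *decoder, int frame_idx,
                         vp9_ref_frame_t *ref) {
  ref->idx = frame_idx; /* frame index to get (input) */
  /* On success, ref->img is populated with the reference frame (output). */
  return vpx_codec_control(decoder, VP9_GET_REFERENCE, ref) == VPX_CODEC_OK;
}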
*/ -#define VP8_EFLAG_NO_UPD_GF (1<<22) - +#define VP8_EFLAG_NO_UPD_GF (1 << 22) /*!\brief Don't update the alternate reference frame * * When this flag is set, the encoder will not update the alt ref frame with * the contents of the current frame. */ -#define VP8_EFLAG_NO_UPD_ARF (1<<23) - +#define VP8_EFLAG_NO_UPD_ARF (1 << 23) /*!\brief Force golden frame update * * When this flag is set, the encoder copy the contents of the current frame * to the golden frame buffer. */ -#define VP8_EFLAG_FORCE_GF (1<<19) - +#define VP8_EFLAG_FORCE_GF (1 << 19) /*!\brief Force alternate reference frame update * * When this flag is set, the encoder copy the contents of the current frame * to the alternate reference frame buffer. */ -#define VP8_EFLAG_FORCE_ARF (1<<24) - +#define VP8_EFLAG_FORCE_ARF (1 << 24) /*!\brief Disable entropy update * * When this flag is set, the encoder will not update its internal entropy * model based on the entropy of this frame. */ -#define VP8_EFLAG_NO_UPD_ENTROPY (1<<20) - +#define VP8_EFLAG_NO_UPD_ENTROPY (1 << 20) /*!\brief VPx encoder control functions * @@ -127,13 +118,13 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP8, VP9 */ - VP8E_USE_REFERENCE = 7, + VP8E_USE_REFERENCE = 7, /*!\brief Codec control function to pass an ROI map to encoder. * * Supported in codecs: VP8, VP9 */ - VP8E_SET_ROI_MAP = 8, + VP8E_SET_ROI_MAP = 8, /*!\brief Codec control function to pass an Active map to encoder. * @@ -145,7 +136,7 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP8, VP9 */ - VP8E_SET_SCALEMODE = 11, + VP8E_SET_SCALEMODE = 11, /*!\brief Codec control function to set encoder internal speed settings. * @@ -158,7 +149,7 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP8, VP9 */ - VP8E_SET_CPUUSED = 13, + VP8E_SET_CPUUSED = 13, /*!\brief Codec control function to enable automatic set and use alf frames. * @@ -510,10 +501,10 @@ enum vp8e_enc_control_id { * This set of constants define 1-D vpx scaling modes */ typedef enum vpx_scaling_mode_1d { - VP8E_NORMAL = 0, - VP8E_FOURFIVE = 1, - VP8E_THREEFIVE = 2, - VP8E_ONETWO = 3 + VP8E_NORMAL = 0, + VP8E_FOURFIVE = 1, + VP8E_THREEFIVE = 2, + VP8E_ONETWO = 3 } VPX_SCALING_MODE; /*!\brief vpx region of interest map @@ -525,13 +516,13 @@ typedef enum vpx_scaling_mode_1d { typedef struct vpx_roi_map { /*! An id between 0 and 3 for each 16x16 region within a frame. */ unsigned char *roi_map; - unsigned int rows; /**< Number of rows. */ - unsigned int cols; /**< Number of columns. */ + unsigned int rows; /**< Number of rows. */ + unsigned int cols; /**< Number of columns. */ // TODO(paulwilkins): broken for VP9 which has 8 segments // q and loop filter deltas for each segment // (see MAX_MB_SEGMENTS) - int delta_q[4]; /**< Quantizer deltas. */ - int delta_lf[4]; /**< Loop filter deltas. */ + int delta_q[4]; /**< Quantizer deltas. */ + int delta_lf[4]; /**< Loop filter deltas. */ /*! Static breakout threshold for each segment. 
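/* Usage sketch: combining the VP8_EFLAG_* bits above into the flags argument
 * of vpx_codec_encode() (declared in vpx/vpx_encoder.h).  Illustrative only,
 * not part of this header; assumes `encoder` and `raw` are an initialized
 * encoder context and a source image. */
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static vpx_codec_err_t encode_from_golden_only(vpx_codec_ctx_t *encoder,
                                               const vpx_image_t *raw,
                                               vpx_codec_pts_t pts) {
  /* Predict only from the golden frame and leave the other buffers intact. */
  const vpx_enc_frame_flags_t flags =
      VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
      VP8_EFLAG_NO_UPD_ARF;
  return vpx_codec_encode(encoder, raw, pts, 1 /* duration */, flags,
                          VPX_DL_GOOD_QUALITY);
}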
*/ unsigned int static_threshold[4]; } vpx_roi_map_t; @@ -542,11 +533,11 @@ typedef struct vpx_roi_map { * */ - typedef struct vpx_active_map { - unsigned char *active_map; /**< specify an on (1) or off (0) each 16x16 region within a frame */ - unsigned int rows; /**< number of rows */ - unsigned int cols; /**< number of cols */ + /*!\brief specify an on (1) or off (0) each 16x16 region within a frame */ + unsigned char *active_map; + unsigned int rows; /**< number of rows */ + unsigned int cols; /**< number of cols */ } vpx_active_map_t; /*!\brief vpx image scaling mode @@ -555,8 +546,8 @@ typedef struct vpx_active_map { * */ typedef struct vpx_scaling_mode { - VPX_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */ - VPX_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */ + VPX_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */ + VPX_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */ } vpx_scaling_mode_t; /*!\brief VP8 token partition mode @@ -567,9 +558,9 @@ typedef struct vpx_scaling_mode { */ typedef enum { - VP8_ONE_TOKENPARTITION = 0, - VP8_TWO_TOKENPARTITION = 1, - VP8_FOUR_TOKENPARTITION = 2, + VP8_ONE_TOKENPARTITION = 0, + VP8_TWO_TOKENPARTITION = 1, + VP8_FOUR_TOKENPARTITION = 2, VP8_EIGHT_TOKENPARTITION = 3 } vp8e_token_partitions; @@ -585,10 +576,7 @@ typedef enum { * Changes the encoder to tune for certain types of input material. * */ -typedef enum { - VPX_TUNE_PSNR, - VPX_TUNE_SSIM -} vpx_tune_metric; +typedef enum { VPX_TUNE_PSNR, VPX_TUNE_SSIM } vpx_tune_metric; /*!\cond */ /*!\brief VP8 encoder control function parameter type @@ -600,53 +588,53 @@ typedef enum { VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_USE_REFERENCE, int) #define VPX_CTRL_VP8E_USE_REFERENCE -VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) +VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) #define VPX_CTRL_VP8E_SET_FRAME_FLAGS -VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) +VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) #define VPX_CTRL_VP8E_SET_ROI_MAP -VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) +VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP8E_SET_ACTIVEMAP -VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) +VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) #define VPX_CTRL_VP8E_SET_SCALEMODE -VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) +VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) #define VPX_CTRL_VP8E_SET_CPUUSED -VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int) #define VPX_CTRL_VP8E_SET_ENABLEAUTOALTREF #if CONFIG_EXT_REFS -VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOBWDREF, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOBWDREF, unsigned int) #define VPX_CTRL_VP8E_SET_ENABLEAUTOBWDREF #endif // CONFIG_EXT_REFS -VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY, unsigned int) #define VPX_CTRL_VP8E_SET_NOISE_SENSITIVITY -VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS, unsigned int) #define VPX_CTRL_VP8E_SET_SHARPNESS -VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int) #define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD -VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ +VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ #define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS -VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) 
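/* Usage sketch for vpx_active_map_t and VP8E_SET_ACTIVEMAP: illustrative only,
 * not part of this header.  rows/cols count 16x16 regions, so they are rounded
 * up from the frame size; `encoder` is assumed to be an initialized encoder
 * context for a w x h stream. */
#include <stdlib.h>
#include <string.h>
#include "vpx/vp8cx.h"

static vpx_codec_err_t mark_all_blocks_active(vpx_codec_ctx_t *encoder,
                                              unsigned int w, unsigned int h) {
  vpx_active_map_t map;
  vpx_codec_err_t res;
  map.cols = (w + 15) / 16;
  map.rows = (h + 15) / 16;
  map.active_map = malloc(map.rows * map.cols);
  if (!map.active_map) return VPX_CODEC_MEM_ERROR;
  memset(map.active_map, 1, map.rows * map.cols); /* 1 = encode this block */
  res = vpx_codec_control(encoder, VP8E_SET_ACTIVEMAP, &map);
  free(map.active_map);
  return res;
}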
+VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) #define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES -VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) #define VPX_CTRL_VP8E_SET_ARNR_STRENGTH -VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE, unsigned int) +VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE, unsigned int) #define VPX_CTRL_VP8E_SET_ARNR_TYPE -VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vpx_tune_metric */ +VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vpx_tune_metric */ #define VPX_CTRL_VP8E_SET_TUNING -VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) #define VPX_CTRL_VP8E_SET_CQ_LEVEL -VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) #define VPX_CTRL_VP9E_SET_TILE_COLUMNS -VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) #define VPX_CTRL_VP9E_SET_TILE_ROWS -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) @@ -672,7 +660,7 @@ VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) #define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST -VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) #define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vpx_tune_content */ @@ -681,10 +669,10 @@ VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vpx_tune_content */ VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) #define VPX_CTRL_VP9E_SET_COLOR_SPACE -VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL -VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *) @@ -703,7 +691,7 @@ VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *) VPX_CTRL_USE_TYPE(VP10E_SET_SUPERBLOCK_SIZE, unsigned int) #define VPX_CTRL_VP10E_SET_SUPERBLOCK_SIZE -VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) #define VPX_CTRL_VP9E_SET_TARGET_LEVEL VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index a8c411af3ff6fff45e35d071b04d6415eac14964..2239b8698e1d3f38643acb49ab77ab957092b703 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\defgroup vp8_decoder WebM VP8/VP9 Decoder * \ingroup vp8 * @@ -33,7 +32,7 @@ extern "C" { * This interface provides the capability to decode VP10 streams. * @{ */ -extern vpx_codec_iface_t vpx_codec_vp10_dx_algo; +extern vpx_codec_iface_t vpx_codec_vp10_dx_algo; extern vpx_codec_iface_t *vpx_codec_vp10_dx(void); /*!@} - end algorithm interface member group*/ @@ -126,18 +125,17 @@ typedef void (*vpx_decrypt_cb)(void *decrypt_state, const unsigned char *input, * Defines a structure to hold the decryption state and access function. */ typedef struct vpx_decrypt_init { - /*! Decrypt callback. */ - vpx_decrypt_cb decrypt_cb; + /*! 
Decrypt callback. */ + vpx_decrypt_cb decrypt_cb; - /*! Decryption state. */ - void *decrypt_state; + /*! Decryption state. */ + void *decrypt_state; } vpx_decrypt_init; /*!\brief A deprecated alias for vpx_decrypt_init. */ typedef vpx_decrypt_init vp8_decrypt_init; - /*!\cond */ /*!\brief VP8 decoder control function parameter type * @@ -146,28 +144,27 @@ typedef vpx_decrypt_init vp8_decrypt_init; * */ - -VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) #define VPX_CTRL_VP8D_GET_LAST_REF_UPDATES -VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) +VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) #define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED -VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) #define VPX_CTRL_VP8D_GET_LAST_REF_USED -VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) +VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VPXD_SET_DECRYPTOR -VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) +VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VP8D_SET_DECRYPTOR -VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) +VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) #define VPX_CTRL_VP9D_GET_DISPLAY_SIZE -VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) +VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) #define VPX_CTRL_VP9D_GET_BIT_DEPTH -VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) +VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) #define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER -VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_ROW, int) +VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_ROW, int) #define VPX_CTRL_VP10_SET_DECODE_TILE_ROW -VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_COL, int) +VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_COL, int) #define VPX_CTRL_VP10_SET_DECODE_TILE_COL /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index e65e3f41f53971e2d3bdb9eb7d039abb2100934b..107469fbe6505604c8a3838ac17069d310e904ce 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\defgroup codec Common Algorithm Interface * This abstraction allows applications to easily support multiple video * formats with minimal code duplication. 
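/* Usage sketch for vpx_decrypt_init and VPXD_SET_DECRYPTOR: illustrative only,
 * not part of this header.  The callback below assumes the full vpx_decrypt_cb
 * signature from earlier in vp8dx.h (decrypt state, input, output, byte
 * count); the XOR "cipher" is a hypothetical stand-in for real key handling. */
#include "vpx/vp8dx.h"

static void my_decrypt_cb(void *decrypt_state, const unsigned char *input,
                          unsigned char *output, int count) {
  const unsigned char key = *(const unsigned char *)decrypt_state;
  int i;
  for (i = 0; i < count; ++i) output[i] = input[i] ^ key;
}

static vpx_codec_err_t install_decryptor(vpx_codec_ctx_t *decoder,
                                         unsigned char *key) {
  vpx_decrypt_init init;
  init.decrypt_cb = my_decrypt_cb; /* decrypt callback */
  init.decrypt_state = key;        /* opaque state handed back to it */
  return vpx_codec_control(decoder, VPXD_SET_DECRYPTOR, &init);
}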
This section describes the interface @@ -46,446 +45,431 @@ extern "C" { #include "./vpx_integer.h" #include "./vpx_image.h" - /*!\brief Decorator indicating a function is deprecated */ +/*!\brief Decorator indicating a function is deprecated */ #ifndef DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DEPRECATED __attribute__ ((deprecated)) +#define DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) #define DEPRECATED #else #define DEPRECATED #endif -#endif /* DEPRECATED */ +#endif /* DEPRECATED */ #ifndef DECLSPEC_DEPRECATED #if defined(__GNUC__) && __GNUC__ #define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ #elif defined(_MSC_VER) -#define DECLSPEC_DEPRECATED __declspec(deprecated) /**< \copydoc #DEPRECATED */ +/*!\brief \copydoc #DEPRECATED */ +#define DECLSPEC_DEPRECATED __declspec(deprecated) #else #define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ #endif -#endif /* DECLSPEC_DEPRECATED */ +#endif /* DECLSPEC_DEPRECATED */ - /*!\brief Decorator indicating a function is potentially unused */ +/*!\brief Decorator indicating a function is potentially unused */ #ifdef UNUSED #elif defined(__GNUC__) || defined(__clang__) -#define UNUSED __attribute__ ((unused)) +#define UNUSED __attribute__((unused)) #else #define UNUSED #endif - /*!\brief Current ABI version number - * - * \internal - * If this file is altered in any way that changes the ABI, this value - * must be bumped. Examples include, but are not limited to, changing - * types, removing or reassigning enums, adding/removing/rearranging - * fields to structures - */ +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ #define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ - /*!\brief Algorithm return codes */ - typedef enum { - /*!\brief Operation completed without error */ - VPX_CODEC_OK, - - /*!\brief Unspecified error */ - VPX_CODEC_ERROR, - - /*!\brief Memory operation failed */ - VPX_CODEC_MEM_ERROR, - - /*!\brief ABI version mismatch */ - VPX_CODEC_ABI_MISMATCH, - - /*!\brief Algorithm does not have required capability */ - VPX_CODEC_INCAPABLE, - - /*!\brief The given bitstream is not supported. - * - * The bitstream was unable to be parsed at the highest level. The decoder - * is unable to proceed. This error \ref SHOULD be treated as fatal to the - * stream. */ - VPX_CODEC_UNSUP_BITSTREAM, - - /*!\brief Encoded bitstream uses an unsupported feature - * - * The decoder does not implement a feature required by the encoder. This - * return code should only be used for features that prevent future - * pictures from being properly decoded. This error \ref MAY be treated as - * fatal to the stream or \ref MAY be treated as fatal to the current GOP. - */ - VPX_CODEC_UNSUP_FEATURE, - - /*!\brief The coded data for this stream is corrupt or incomplete - * - * There was a problem decoding the current frame. This return code - * should only be used for failures that prevent future pictures from - * being properly decoded. This error \ref MAY be treated as fatal to the - * stream or \ref MAY be treated as fatal to the current GOP. If decoding - * is continued for the current GOP, artifacts may be present. - */ - VPX_CODEC_CORRUPT_FRAME, - - /*!\brief An application-supplied parameter is not valid. 
- * - */ - VPX_CODEC_INVALID_PARAM, - - /*!\brief An iterator reached the end of list. - * - */ - VPX_CODEC_LIST_END - - } - vpx_codec_err_t; - - - /*! \brief Codec capabilities bitfield - * - * Each codec advertises the capabilities it supports as part of its - * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces - * or functionality, and are not required to be supported. - * - * The available flags are specified by VPX_CODEC_CAP_* defines. - */ - typedef long vpx_codec_caps_t; -#define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ -#define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ +/*!\brief Algorithm return codes */ +typedef enum { + /*!\brief Operation completed without error */ + VPX_CODEC_OK, + /*!\brief Unspecified error */ + VPX_CODEC_ERROR, - /*! \brief Initialization-time Feature Enabling - * - * Certain codec features must be known at initialization time, to allow for - * proper memory allocation. - * - * The available flags are specified by VPX_CODEC_USE_* defines. - */ - typedef long vpx_codec_flags_t; - + /*!\brief Memory operation failed */ + VPX_CODEC_MEM_ERROR, - /*!\brief Codec interface structure. - * - * Contains function pointers and other data private to the codec - * implementation. This structure is opaque to the application. - */ - typedef const struct vpx_codec_iface vpx_codec_iface_t; + /*!\brief ABI version mismatch */ + VPX_CODEC_ABI_MISMATCH, + /*!\brief Algorithm does not have required capability */ + VPX_CODEC_INCAPABLE, - /*!\brief Codec private data structure. + /*!\brief The given bitstream is not supported. * - * Contains data private to the codec implementation. This structure is opaque - * to the application. - */ - typedef struct vpx_codec_priv vpx_codec_priv_t; - + * The bitstream was unable to be parsed at the highest level. The decoder + * is unable to proceed. This error \ref SHOULD be treated as fatal to the + * stream. */ + VPX_CODEC_UNSUP_BITSTREAM, - /*!\brief Iterator + /*!\brief Encoded bitstream uses an unsupported feature * - * Opaque storage used for iterating over lists. + * The decoder does not implement a feature required by the encoder. This + * return code should only be used for features that prevent future + * pictures from being properly decoded. This error \ref MAY be treated as + * fatal to the stream or \ref MAY be treated as fatal to the current GOP. */ - typedef const void *vpx_codec_iter_t; - + VPX_CODEC_UNSUP_FEATURE, - /*!\brief Codec context structure + /*!\brief The coded data for this stream is corrupt or incomplete * - * All codecs \ref MUST support this context structure fully. In general, - * this data should be considered private to the codec algorithm, and - * not be manipulated or examined by the calling application. Applications - * may reference the 'name' member to get a printable description of the - * algorithm. + * There was a problem decoding the current frame. This return code + * should only be used for failures that prevent future pictures from + * being properly decoded. This error \ref MAY be treated as fatal to the + * stream or \ref MAY be treated as fatal to the current GOP. If decoding + * is continued for the current GOP, artifacts may be present. 
*/ - typedef struct vpx_codec_ctx { - const char *name; /**< Printable interface name */ - vpx_codec_iface_t *iface; /**< Interface pointers */ - vpx_codec_err_t err; /**< Last returned error */ - const char *err_detail; /**< Detailed info, if available */ - vpx_codec_flags_t init_flags; /**< Flags passed at init time */ - union { - /**< Decoder Configuration Pointer */ - const struct vpx_codec_dec_cfg *dec; - /**< Encoder Configuration Pointer */ - const struct vpx_codec_enc_cfg *enc; - const void *raw; - } config; /**< Configuration pointer aliasing union */ - vpx_codec_priv_t *priv; /**< Algorithm private storage */ - } vpx_codec_ctx_t; - - /*!\brief Bit depth for codec - * * - * This enumeration determines the bit depth of the codec. - */ - typedef enum vpx_bit_depth { - VPX_BITS_8 = 8, /**< 8 bits */ - VPX_BITS_10 = 10, /**< 10 bits */ - VPX_BITS_12 = 12, /**< 12 bits */ - } vpx_bit_depth_t; + VPX_CODEC_CORRUPT_FRAME, - /*!\brief Superblock size selection. + /*!\brief An application-supplied parameter is not valid. * - * Defines the superblock size used for encoding. The superblock size can - * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically - * selected by the encoder for each frame. - */ - typedef enum vpx_superblock_size { - VPX_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. */ - VPX_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */ - VPX_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */ - } vpx_superblock_size_t; - - /* - * Library Version Number Interface - * - * For example, see the following sample return values: - * vpx_codec_version() (1<<16 | 2<<8 | 3) - * vpx_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" - * vpx_codec_version_extra_str() "rc1-16-gec6a1ba" */ + VPX_CODEC_INVALID_PARAM, - /*!\brief Return the version information (as an integer) - * - * Returns a packed encoding of the library version number. This will only include - * the major.minor.patch component of the version number. Note that this encoded - * value should be accessed through the macros provided, as the encoding may change - * in the future. + /*!\brief An iterator reached the end of list. * */ - int vpx_codec_version(void); -#define VPX_VERSION_MAJOR(v) ((v>>16)&0xff) /**< extract major from packed version */ -#define VPX_VERSION_MINOR(v) ((v>>8)&0xff) /**< extract minor from packed version */ -#define VPX_VERSION_PATCH(v) ((v>>0)&0xff) /**< extract patch from packed version */ + VPX_CODEC_LIST_END - /*!\brief Return the version major number */ -#define vpx_codec_version_major() ((vpx_codec_version()>>16)&0xff) +} vpx_codec_err_t; - /*!\brief Return the version minor number */ -#define vpx_codec_version_minor() ((vpx_codec_version()>>8)&0xff) +/*! \brief Codec capabilities bitfield + * + * Each codec advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +typedef long vpx_codec_caps_t; +#define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ +#define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ - /*!\brief Return the version patch number */ -#define vpx_codec_version_patch() ((vpx_codec_version()>>0)&0xff) +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. 
+ * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +typedef long vpx_codec_flags_t; +/*!\brief Codec interface structure. + * + * Contains function pointers and other data private to the codec + * implementation. This structure is opaque to the application. + */ +typedef const struct vpx_codec_iface vpx_codec_iface_t; - /*!\brief Return the version information (as a string) - * - * Returns a printable string containing the full library version number. This may - * contain additional text following the three digit version number, as to indicate - * release candidates, prerelease versions, etc. - * - */ - const char *vpx_codec_version_str(void); +/*!\brief Codec private data structure. + * + * Contains data private to the codec implementation. This structure is opaque + * to the application. + */ +typedef struct vpx_codec_priv vpx_codec_priv_t; +/*!\brief Iterator + * + * Opaque storage used for iterating over lists. + */ +typedef const void *vpx_codec_iter_t; - /*!\brief Return the version information (as a string) - * - * Returns a printable "extra string". This is the component of the string returned - * by vpx_codec_version_str() following the three digit version number. - * - */ - const char *vpx_codec_version_extra_str(void); +/*!\brief Codec context structure + * + * All codecs \ref MUST support this context structure fully. In general, + * this data should be considered private to the codec algorithm, and + * not be manipulated or examined by the calling application. Applications + * may reference the 'name' member to get a printable description of the + * algorithm. + */ +typedef struct vpx_codec_ctx { + const char *name; /**< Printable interface name */ + vpx_codec_iface_t *iface; /**< Interface pointers */ + vpx_codec_err_t err; /**< Last returned error */ + const char *err_detail; /**< Detailed info, if available */ + vpx_codec_flags_t init_flags; /**< Flags passed at init time */ + union { + /**< Decoder Configuration Pointer */ + const struct vpx_codec_dec_cfg *dec; + /**< Encoder Configuration Pointer */ + const struct vpx_codec_enc_cfg *enc; + const void *raw; + } config; /**< Configuration pointer aliasing union */ + vpx_codec_priv_t *priv; /**< Algorithm private storage */ +} vpx_codec_ctx_t; + +/*!\brief Bit depth for codec + * * + * This enumeration determines the bit depth of the codec. + */ +typedef enum vpx_bit_depth { + VPX_BITS_8 = 8, /**< 8 bits */ + VPX_BITS_10 = 10, /**< 10 bits */ + VPX_BITS_12 = 12, /**< 12 bits */ +} vpx_bit_depth_t; +/*!\brief Superblock size selection. + * + * Defines the superblock size used for encoding. The superblock size can + * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically + * selected by the encoder for each frame. + */ +typedef enum vpx_superblock_size { + VPX_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. */ + VPX_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */ + VPX_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */ +} vpx_superblock_size_t; - /*!\brief Return the build configuration - * - * Returns a printable string containing an encoded version of the build - * configuration. This may be useful to vpx support. 
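/* Usage sketch: unpacking the packed version number with the macros above and
 * pairing it with the version/build strings.  Illustrative only, not part of
 * this header. */
#include <stdio.h>
#include "vpx/vpx_codec.h"

static void print_vpx_version(void) {
  const int v = vpx_codec_version();
  printf("libvpx %d.%d.%d (%s), built with %s\n", VPX_VERSION_MAJOR(v),
         VPX_VERSION_MINOR(v), VPX_VERSION_PATCH(v), vpx_codec_version_str(),
         vpx_codec_build_config());
}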
- * - */ - const char *vpx_codec_build_config(void); +/* + * Library Version Number Interface + * + * For example, see the following sample return values: + * vpx_codec_version() (1<<16 | 2<<8 | 3) + * vpx_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" + * vpx_codec_version_extra_str() "rc1-16-gec6a1ba" + */ +/*!\brief Return the version information (as an integer) + * + * Returns a packed encoding of the library version number. This will only + * include + * the major.minor.patch component of the version number. Note that this encoded + * value should be accessed through the macros provided, as the encoding may + * change + * in the future. + * + */ +int vpx_codec_version(void); +#define VPX_VERSION_MAJOR(v) \ + ((v >> 16) & 0xff) /**< extract major from packed version */ +#define VPX_VERSION_MINOR(v) \ + ((v >> 8) & 0xff) /**< extract minor from packed version */ +#define VPX_VERSION_PATCH(v) \ + ((v >> 0) & 0xff) /**< extract patch from packed version */ - /*!\brief Return the name for a given interface - * - * Returns a human readable string for name of the given codec interface. - * - * \param[in] iface Interface pointer - * - */ - const char *vpx_codec_iface_name(vpx_codec_iface_t *iface); +/*!\brief Return the version major number */ +#define vpx_codec_version_major() ((vpx_codec_version() >> 16) & 0xff) +/*!\brief Return the version minor number */ +#define vpx_codec_version_minor() ((vpx_codec_version() >> 8) & 0xff) - /*!\brief Convert error number to printable string - * - * Returns a human readable string for the last error returned by the - * algorithm. The returned error will be one line and will not contain - * any newline characters. - * - * - * \param[in] err Error number. - * - */ - const char *vpx_codec_err_to_string(vpx_codec_err_t err); +/*!\brief Return the version patch number */ +#define vpx_codec_version_patch() ((vpx_codec_version() >> 0) & 0xff) +/*!\brief Return the version information (as a string) + * + * Returns a printable string containing the full library version number. This + * may + * contain additional text following the three digit version number, as to + * indicate + * release candidates, prerelease versions, etc. + * + */ +const char *vpx_codec_version_str(void); - /*!\brief Retrieve error synopsis for codec context - * - * Returns a human readable string for the last error returned by the - * algorithm. The returned error will be one line and will not contain - * any newline characters. - * - * - * \param[in] ctx Pointer to this instance's context. - * - */ - const char *vpx_codec_error(vpx_codec_ctx_t *ctx); +/*!\brief Return the version information (as a string) + * + * Returns a printable "extra string". This is the component of the string + * returned + * by vpx_codec_version_str() following the three digit version number. + * + */ +const char *vpx_codec_version_extra_str(void); +/*!\brief Return the build configuration + * + * Returns a printable string containing an encoded version of the build + * configuration. This may be useful to vpx support. + * + */ +const char *vpx_codec_build_config(void); - /*!\brief Retrieve detailed error information for codec context - * - * Returns a human readable string providing detailed information about - * the last error. - * - * \param[in] ctx Pointer to this instance's context. - * - * \retval NULL - * No detailed information is available. 
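/* Usage sketch: turning a failed call into a readable diagnostic with the
 * error helpers above.  Illustrative only, not part of this header; `ctx` is
 * assumed to be the context the failing call was made on. */
#include <stdio.h>
#include "vpx/vpx_codec.h"

static void report_error(vpx_codec_ctx_t *ctx, vpx_codec_err_t res,
                         const char *what) {
  const char *detail = vpx_codec_error_detail(ctx); /* may be NULL */
  fprintf(stderr, "%s failed: %s (%s)\n", what, vpx_codec_err_to_string(res),
          vpx_codec_error(ctx));
  if (detail) fprintf(stderr, "  detail: %s\n", detail);
}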
- */ - const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); +/*!\brief Return the name for a given interface + * + * Returns a human readable string for name of the given codec interface. + * + * \param[in] iface Interface pointer + * + */ +const char *vpx_codec_iface_name(vpx_codec_iface_t *iface); +/*!\brief Convert error number to printable string + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] err Error number. + * + */ +const char *vpx_codec_err_to_string(vpx_codec_err_t err); - /* REQUIRED FUNCTIONS - * - * The following functions are required to be implemented for all codecs. - * They represent the base case functionality expected of all codecs. - */ +/*!\brief Retrieve error synopsis for codec context + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] ctx Pointer to this instance's context. + * + */ +const char *vpx_codec_error(vpx_codec_ctx_t *ctx); - /*!\brief Destroy a codec instance - * - * Destroys a codec context, freeing any associated memory buffers. - * - * \param[in] ctx Pointer to this instance's context - * - * \retval #VPX_CODEC_OK - * The codec algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. - */ - vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); +/*!\brief Retrieve detailed error information for codec context + * + * Returns a human readable string providing detailed information about + * the last error. + * + * \param[in] ctx Pointer to this instance's context. + * + * \retval NULL + * No detailed information is available. + */ +const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all codecs. + * They represent the base case functionality expected of all codecs. + */ - /*!\brief Get the capabilities of an algorithm. - * - * Retrieves the capabilities bitfield from the algorithm's interface. - * - * \param[in] iface Pointer to the algorithm interface - * - */ - vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface); +/*!\brief Destroy a codec instance + * + * Destroys a codec context, freeing any associated memory buffers. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval #VPX_CODEC_OK + * The codec algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ +vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); +/*!\brief Get the capabilities of an algorithm. + * + * Retrieves the capabilities bitfield from the algorithm's interface. + * + * \param[in] iface Pointer to the algorithm interface + * + */ +vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface); - /*!\brief Control algorithm - * - * This function is used to exchange algorithm specific data with the codec - * instance. This can be used to implement features specific to a particular - * algorithm. - * - * This wrapper function dispatches the request to the helper function - * associated with the given ctrl_id. It tries to call this function - * transparently, but will return #VPX_CODEC_ERROR if the request could not - * be dispatched. - * - * Note that this function should not be used directly. Call the - * #vpx_codec_control wrapper macro instead. 
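/* Usage sketch: calling a control through the vpx_codec_control() wrapper
 * macro rather than vpx_codec_control_() directly, so the argument type is
 * checked against the VPX_CTRL_USE_TYPE declaration at compile time.
 * Illustrative only, not part of this header; assumes `encoder` is an
 * initialized encoder context. */
#include "vpx/vp8cx.h"

static vpx_codec_err_t tune_speed(vpx_codec_ctx_t *encoder) {
  /* VP8E_SET_CPUUSED is declared to take an int, so the wrapper below rejects
   * mismatched argument types at compile time. */
  return vpx_codec_control(encoder, VP8E_SET_CPUUSED, 4);
}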
- * - * \param[in] ctx Pointer to this instance's context - * \param[in] ctrl_id Algorithm specific control identifier - * - * \retval #VPX_CODEC_OK - * The control request was processed. - * \retval #VPX_CODEC_ERROR - * The control request was not processed. - * \retval #VPX_CODEC_INVALID_PARAM - * The data was not valid. - */ - vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, - int ctrl_id, - ...); +/*!\brief Control algorithm + * + * This function is used to exchange algorithm specific data with the codec + * instance. This can be used to implement features specific to a particular + * algorithm. + * + * This wrapper function dispatches the request to the helper function + * associated with the given ctrl_id. It tries to call this function + * transparently, but will return #VPX_CODEC_ERROR if the request could not + * be dispatched. + * + * Note that this function should not be used directly. Call the + * #vpx_codec_control wrapper macro instead. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] ctrl_id Algorithm specific control identifier + * + * \retval #VPX_CODEC_OK + * The control request was processed. + * \retval #VPX_CODEC_ERROR + * The control request was not processed. + * \retval #VPX_CODEC_INVALID_PARAM + * The data was not valid. + */ +vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); #if defined(VPX_DISABLE_CTRL_TYPECHECKS) && VPX_DISABLE_CTRL_TYPECHECKS -# define vpx_codec_control(ctx,id,data) vpx_codec_control_(ctx,id,data) -# define VPX_CTRL_USE_TYPE(id, typ) -# define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) -# define VPX_CTRL_VOID(id, typ) +#define vpx_codec_control(ctx, id, data) vpx_codec_control_(ctx, id, data) +#define VPX_CTRL_USE_TYPE(id, typ) +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) +#define VPX_CTRL_VOID(id, typ) #else - /*!\brief vpx_codec_control wrapper macro - * - * This macro allows for type safe conversions across the variadic parameter - * to vpx_codec_control_(). - * - * \internal - * It works by dispatching the call to the control function through a wrapper - * function named with the id parameter. - */ -# define vpx_codec_control(ctx,id,data) vpx_codec_control_##id(ctx,id,data)\ - /**<\hideinitializer*/ - +/*!\brief vpx_codec_control wrapper macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). + * + * \internal + * It works by dispatching the call to the control function through a wrapper + * function named with the id parameter. + */ +#define vpx_codec_control(ctx, id, data) \ + vpx_codec_control_##id(ctx, id, data) /**<\hideinitializer*/ - /*!\brief vpx_codec_control type definition macro - * - * This macro allows for type safe conversions across the variadic parameter - * to vpx_codec_control_(). It defines the type of the argument for a given - * control identifier. - * - * \internal - * It defines a static function with - * the correctly typed arguments as a wrapper to the type-unsafe internal - * function. - */ -# define VPX_CTRL_USE_TYPE(id, typ) \ - static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t*, int, typ) UNUSED;\ - \ - static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t *ctx, int ctrl_id, typ data) {\ - return vpx_codec_control_(ctx, ctrl_id, data);\ +/*!\brief vpx_codec_control type definition macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). It defines the type of the argument for a given + * control identifier. 
+ * + * \internal + * It defines a static function with + * the correctly typed arguments as a wrapper to the type-unsafe internal + * function. + */ +#define VPX_CTRL_USE_TYPE(id, typ) \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \ + UNUSED; \ + \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ + int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ } /**<\hideinitializer*/ - - /*!\brief vpx_codec_control deprecated type definition macro - * - * Like #VPX_CTRL_USE_TYPE, but indicates that the specified control is - * deprecated and should not be used. Consult the documentation for your - * codec for more information. - * - * \internal - * It defines a static function with the correctly typed arguments as a - * wrapper to the type-unsafe internal function. - */ -# define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - DECLSPEC_DEPRECATED static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t*, int, typ) DEPRECATED UNUSED;\ - \ - DECLSPEC_DEPRECATED static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t *ctx, int ctrl_id, typ data) {\ - return vpx_codec_control_(ctx, ctrl_id, data);\ +/*!\brief vpx_codec_control deprecated type definition macro + * + * Like #VPX_CTRL_USE_TYPE, but indicates that the specified control is + * deprecated and should not be used. Consult the documentation for your + * codec for more information. + * + * \internal + * It defines a static function with the correctly typed arguments as a + * wrapper to the type-unsafe internal function. + */ +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *, int, typ) DEPRECATED UNUSED; \ + \ + DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ } /**<\hideinitializer*/ - - /*!\brief vpx_codec_control void type definition macro - * - * This macro allows for type safe conversions across the variadic parameter - * to vpx_codec_control_(). It indicates that a given control identifier takes - * no argument. - * - * \internal - * It defines a static function without a data argument as a wrapper to the - * type-unsafe internal function. - */ -# define VPX_CTRL_VOID(id) \ - static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t*, int) UNUSED;\ - \ - static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t *ctx, int ctrl_id) {\ - return vpx_codec_control_(ctx, ctrl_id);\ +/*!\brief vpx_codec_control void type definition macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). It indicates that a given control identifier takes + * no argument. + * + * \internal + * It defines a static function without a data argument as a wrapper to the + * type-unsafe internal function. 
+ */ +#define VPX_CTRL_VOID(id) \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \ + UNUSED; \ + \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ + int ctrl_id) { \ + return vpx_codec_control_(ctx, ctrl_id); \ } /**<\hideinitializer*/ - #endif - /*!@} - end defgroup codec*/ +/*!@} - end defgroup codec*/ #ifdef __cplusplus } #endif #endif // VPX_VPX_CODEC_H_ - diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index bfe90c6112e9c87fcf2bf0487cd8bb84a59f6a0b..3d8dd6ccb661396fb7269e1c26e13dd49bc19b71 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -32,347 +32,334 @@ extern "C" { #include "./vpx_codec.h" #include "./vpx_frame_buffer.h" - /*!\brief Current ABI version number - * - * \internal - * If this file is altered in any way that changes the ABI, this value - * must be bumped. Examples include, but are not limited to, changing - * types, removing or reassigning enums, adding/removing/rearranging - * fields to structures - */ -#define VPX_DECODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ - - /*! \brief Decoder capabilities bitfield - * - * Each decoder advertises the capabilities it supports as part of its - * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces - * or functionality, and are not required to be supported by a decoder. - * - * The available flags are specified by VPX_CODEC_CAP_* defines. - */ -#define VPX_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */ -#define VPX_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */ -#define VPX_CODEC_CAP_POSTPROC 0x40000 /**< Can postprocess decoded frame */ -#define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000 /**< Can conceal errors due to - packet loss */ -#define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000 /**< Can receive encoded frames - one fragment at a time */ - - /*! \brief Initialization-time Feature Enabling - * - * Certain codec features must be known at initialization time, to allow for - * proper memory allocation. - * - * The available flags are specified by VPX_CODEC_USE_* defines. - */ -#define VPX_CODEC_CAP_FRAME_THREADING 0x200000 /**< Can support frame-based - multi-threading */ -#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 /**< Can support external - frame buffers */ - -#define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */ -#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 /**< Conceal errors in decoded - frames */ -#define VPX_CODEC_USE_INPUT_FRAGMENTS 0x40000 /**< The input frame should be - passed to the decoder one - fragment at a time */ -#define VPX_CODEC_USE_FRAME_THREADING 0x80000 /**< Enable frame-based - multi-threading */ - - /*!\brief Stream properties - * - * This structure is used to query or set properties of the decoded - * stream. Algorithms may extend this structure with data specific - * to their bitstream by setting the sz member appropriately. - */ - typedef struct vpx_codec_stream_info { - unsigned int sz; /**< Size of this structure */ - unsigned int w; /**< Width (or 0 for unknown/default) */ - unsigned int h; /**< Height (or 0 for unknown/default) */ - unsigned int is_kf; /**< Current frame is a keyframe */ - } vpx_codec_stream_info_t; - - /* REQUIRED FUNCTIONS - * - * The following functions are required to be implemented for all decoders. - * They represent the base case functionality expected of all decoders. 
- */ - +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_DECODER_ABI_VERSION \ + (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ - /*!\brief Initialization Configurations - * - * This structure is used to pass init time configuration options to the - * decoder. - */ - typedef struct vpx_codec_dec_cfg { - unsigned int threads; /**< Maximum number of threads to use, default 1 */ - unsigned int w; /**< Width */ - unsigned int h; /**< Height */ - } vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */ +/*! \brief Decoder capabilities bitfield + * + * Each decoder advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported by a decoder. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +#define VPX_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */ +#define VPX_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */ +#define VPX_CODEC_CAP_POSTPROC 0x40000 /**< Can postprocess decoded frame */ +/*!\brief Can conceal errors due to packet loss */ +#define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000 +/*!\brief Can receive encoded frames one fragment at a time */ +#define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +/*!\brief Can support frame-based multi-threading */ +#define VPX_CODEC_CAP_FRAME_THREADING 0x200000 +/*!brief Can support external frame buffers */ +#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 + +#define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */ +/*!\brief Conceal errors in decoded frames */ +#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 +/*!\brief The input frame should be passed to the decoder one fragment at a + * time */ +#define VPX_CODEC_USE_INPUT_FRAGMENTS 0x40000 +/*!\brief Enable frame-based multi-threading */ +#define VPX_CODEC_USE_FRAME_THREADING 0x80000 + +/*!\brief Stream properties + * + * This structure is used to query or set properties of the decoded + * stream. Algorithms may extend this structure with data specific + * to their bitstream by setting the sz member appropriately. + */ +typedef struct vpx_codec_stream_info { + unsigned int sz; /**< Size of this structure */ + unsigned int w; /**< Width (or 0 for unknown/default) */ + unsigned int h; /**< Height (or 0 for unknown/default) */ + unsigned int is_kf; /**< Current frame is a keyframe */ +} vpx_codec_stream_info_t; + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all decoders. + * They represent the base case functionality expected of all decoders. + */ +/*!\brief Initialization Configurations + * + * This structure is used to pass init time configuration options to the + * decoder. 
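/* Usage sketch: checking the interface's capability bits before requesting an
 * optional feature at initialization time.  Illustrative only, not part of
 * this header. */
#include "vpx/vpx_decoder.h"

static vpx_codec_flags_t choose_init_flags(vpx_codec_iface_t *iface) {
  vpx_codec_flags_t flags = 0;
  /* Only request postprocessing when the decoder advertises the capability. */
  if (vpx_codec_get_caps(iface) & VPX_CODEC_CAP_POSTPROC)
    flags |= VPX_CODEC_USE_POSTPROC;
  return flags;
}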
+ */ +typedef struct vpx_codec_dec_cfg { + unsigned int threads; /**< Maximum number of threads to use, default 1 */ + unsigned int w; /**< Width */ + unsigned int h; /**< Height */ +} vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */ - /*!\brief Initialize a decoder instance - * - * Initializes a decoder context using the given interface. Applications - * should call the vpx_codec_dec_init convenience macro instead of this - * function directly, to ensure that the ABI version number parameter - * is properly initialized. - * - * If the library was configured with --disable-multithread, this call - * is not thread safe and should be guarded with a lock if being used - * in a multithreaded context. - * - * \param[in] ctx Pointer to this instance's context. - * \param[in] iface Pointer to the algorithm interface to use. - * \param[in] cfg Configuration to use, if known. May be NULL. - * \param[in] flags Bitfield of VPX_CODEC_USE_* flags - * \param[in] ver ABI version number. Must be set to - * VPX_DECODER_ABI_VERSION - * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. - */ - vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, - const vpx_codec_dec_cfg_t *cfg, - vpx_codec_flags_t flags, - int ver); +/*!\brief Initialize a decoder instance + * + * Initializes a decoder context using the given interface. Applications + * should call the vpx_codec_dec_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with --disable-multithread, this call + * is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * VPX_DECODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ +vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + const vpx_codec_dec_cfg_t *cfg, + vpx_codec_flags_t flags, int ver); - /*!\brief Convenience macro for vpx_codec_dec_init_ver() - * - * Ensures the ABI version parameter is properly set. - */ +/*!\brief Convenience macro for vpx_codec_dec_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ #define vpx_codec_dec_init(ctx, iface, cfg, flags) \ vpx_codec_dec_init_ver(ctx, iface, cfg, flags, VPX_DECODER_ABI_VERSION) +/*!\brief Parse stream info from a buffer + * + * Performs high level parsing of the bitstream. Construction of a decoder + * context is not necessary. Can be used to determine if the bitstream is + * of the proper format, and to extract information from the stream. + * + * \param[in] iface Pointer to the algorithm interface + * \param[in] data Pointer to a block of data to parse + * \param[in] data_sz Size of the data buffer + * \param[in,out] si Pointer to stream info to update. The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. 
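/* Usage sketch: initializing a decoder with an explicit configuration via the
 * vpx_codec_dec_init() convenience macro.  Illustrative only, not part of this
 * header; the interface comes from vpx_codec_vp10_dx() declared in vp8dx.h. */
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static vpx_codec_err_t open_decoder(vpx_codec_ctx_t *decoder,
                                    unsigned int threads) {
  vpx_codec_dec_cfg_t cfg;
  cfg.threads = threads; /* maximum number of threads to use */
  cfg.w = 0;             /* unknown until the stream is parsed */
  cfg.h = 0;
  return vpx_codec_dec_init(decoder, vpx_codec_vp10_dx(), &cfg, 0);
}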
+ * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ +vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, + const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si); - /*!\brief Parse stream info from a buffer - * - * Performs high level parsing of the bitstream. Construction of a decoder - * context is not necessary. Can be used to determine if the bitstream is - * of the proper format, and to extract information from the stream. - * - * \param[in] iface Pointer to the algorithm interface - * \param[in] data Pointer to a block of data to parse - * \param[in] data_sz Size of the data buffer - * \param[in,out] si Pointer to stream info to update. The size member - * \ref MUST be properly initialized, but \ref MAY be - * clobbered by the algorithm. This parameter \ref MAY - * be NULL. - * - * \retval #VPX_CODEC_OK - * Bitstream is parsable and stream information updated - */ - vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, - const uint8_t *data, - unsigned int data_sz, - vpx_codec_stream_info_t *si); - - - /*!\brief Return information about the current stream. - * - * Returns information about the stream that has been parsed during decoding. - * - * \param[in] ctx Pointer to this instance's context - * \param[in,out] si Pointer to stream info to update. The size member - * \ref MUST be properly initialized, but \ref MAY be - * clobbered by the algorithm. This parameter \ref MAY - * be NULL. - * - * \retval #VPX_CODEC_OK - * Bitstream is parsable and stream information updated - */ - vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, - vpx_codec_stream_info_t *si); - - - /*!\brief Decode data - * - * Processes a buffer of coded data. If the processing results in a new - * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be - * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode - * time stamp) order. Frames produced will always be in PTS (presentation - * time stamp) order. - * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled, - * data and data_sz can contain a fragment of the encoded frame. Fragment - * \#n must contain at least partition \#n, but can also contain subsequent - * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must - * be empty. When no more data is available, this function should be called - * with NULL as data and 0 as data_sz. The memory passed to this function - * must be available until the frame has been decoded. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] data Pointer to this block of new coded data. If - * NULL, a VPX_CODEC_CB_PUT_FRAME event is posted - * for the previously decoded frame. - * \param[in] data_sz Size of the coded data, in bytes. - * \param[in] user_priv Application specific data to associate with - * this frame. - * \param[in] deadline Soft deadline the decoder should attempt to meet, - * in us. Set to zero for unlimited. - * - * \return Returns #VPX_CODEC_OK if the coded data was processed completely - * and future pictures can be decoded without error. Otherwise, - * see the descriptions of the other error codes in ::vpx_codec_err_t - * for recoverability capabilities. - */ - vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, - const uint8_t *data, - unsigned int data_sz, - void *user_priv, - long deadline); - - - /*!\brief Decoded frames iterator - * - * Iterates over a list of the frames available for display. 
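/* Usage sketch: probing a buffer with vpx_codec_peek_stream_info() before any
 * decoder context exists.  Illustrative only, not part of this header; note
 * that the sz member must be initialized before the call. */
#include <string.h>
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static int looks_decodable(const uint8_t *data, unsigned int data_sz,
                           unsigned int *w, unsigned int *h) {
  vpx_codec_stream_info_t si;
  memset(&si, 0, sizeof(si));
  si.sz = sizeof(si); /* size member MUST be initialized */
  if (vpx_codec_peek_stream_info(vpx_codec_vp10_dx(), data, data_sz, &si) !=
      VPX_CODEC_OK)
    return 0;
  *w = si.w; /* 0 if unknown */
  *h = si.h;
  return 1;
}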
The iterator - * storage should be initialized to NULL to start the iteration. Iteration is - * complete when this function returns NULL. - * - * The list of available frames becomes valid upon completion of the - * vpx_codec_decode call, and remains valid until the next call to vpx_codec_decode. - * - * \param[in] ctx Pointer to this instance's context - * \param[in,out] iter Iterator storage, initialized to NULL - * - * \return Returns a pointer to an image, if one is ready for display. Frames - * produced will always be in PTS (presentation time stamp) order. - */ - vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, - vpx_codec_iter_t *iter); - - - /*!\defgroup cap_put_frame Frame-Based Decoding Functions - * - * The following functions are required to be implemented for all decoders - * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these functions - * for codecs that don't advertise this capability will result in an error - * code being returned, usually VPX_CODEC_ERROR - * @{ - */ - - /*!\brief put frame callback prototype - * - * This callback is invoked by the decoder to notify the application of - * the availability of decoded image data. - */ - typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv, - const vpx_image_t *img); +/*!\brief Return information about the current stream. + * + * Returns information about the stream that has been parsed during decoding. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] si Pointer to stream info to update. The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. + * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ +vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, + vpx_codec_stream_info_t *si); +/*!\brief Decode data + * + * Processes a buffer of coded data. If the processing results in a new + * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be + * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode + * time stamp) order. Frames produced will always be in PTS (presentation + * time stamp) order. + * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled, + * data and data_sz can contain a fragment of the encoded frame. Fragment + * \#n must contain at least partition \#n, but can also contain subsequent + * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must + * be empty. When no more data is available, this function should be called + * with NULL as data and 0 as data_sz. The memory passed to this function + * must be available until the frame has been decoded. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] data Pointer to this block of new coded data. If + * NULL, a VPX_CODEC_CB_PUT_FRAME event is posted + * for the previously decoded frame. + * \param[in] data_sz Size of the coded data, in bytes. + * \param[in] user_priv Application specific data to associate with + * this frame. + * \param[in] deadline Soft deadline the decoder should attempt to meet, + * in us. Set to zero for unlimited. + * + * \return Returns #VPX_CODEC_OK if the coded data was processed completely + * and future pictures can be decoded without error. Otherwise, + * see the descriptions of the other error codes in ::vpx_codec_err_t + * for recoverability capabilities. 
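/* Usage sketch: feeding one coded frame to the decoder and draining every
 * image it produces.  Illustrative only, not part of this header; assumes
 * `decoder` was initialized as above and show_image() is a hypothetical sink
 * supplied by the application. */
#include "vpx/vpx_decoder.h"

extern void show_image(const vpx_image_t *img); /* hypothetical */

static vpx_codec_err_t decode_one(vpx_codec_ctx_t *decoder,
                                  const uint8_t *data, unsigned int data_sz) {
  vpx_codec_iter_t iter = NULL; /* iterator storage, initialized to NULL */
  const vpx_image_t *img;
  const vpx_codec_err_t res =
      vpx_codec_decode(decoder, data, data_sz, NULL, 0 /* no deadline */);
  if (res != VPX_CODEC_OK) return res;
  /* Frames come back in PTS order; the list stays valid until the next call
   * to vpx_codec_decode. */
  while ((img = vpx_codec_get_frame(decoder, &iter)) != NULL) show_image(img);
  return VPX_CODEC_OK;
}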
+ */ +vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, + unsigned int data_sz, void *user_priv, + long deadline); - /*!\brief Register for notification of frame completion. - * - * Registers a given function to be called when a decoded frame is - * available. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] cb Pointer to the callback function - * \param[in] user_priv User's private data - * - * \retval #VPX_CODEC_OK - * Callback successfully registered. - * \retval #VPX_CODEC_ERROR - * Decoder context not initialized, or algorithm not capable of - * posting slice completion. - */ - vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, - vpx_codec_put_frame_cb_fn_t cb, - void *user_priv); +/*!\brief Decoded frames iterator + * + * Iterates over a list of the frames available for display. The iterator + * storage should be initialized to NULL to start the iteration. Iteration is + * complete when this function returns NULL. + * + * The list of available frames becomes valid upon completion of the + * vpx_codec_decode call, and remains valid until the next call to + * vpx_codec_decode. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. + */ +vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter); +/*!\defgroup cap_put_frame Frame-Based Decoding Functions + * + * The following functions are required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these + * functions + * for codecs that don't advertise this capability will result in an error + * code being returned, usually VPX_CODEC_ERROR + * @{ + */ - /*!@} - end defgroup cap_put_frame */ +/*!\brief put frame callback prototype + * + * This callback is invoked by the decoder to notify the application of + * the availability of decoded image data. + */ +typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv, + const vpx_image_t *img); - /*!\defgroup cap_put_slice Slice-Based Decoding Functions - * - * The following functions are required to be implemented for all decoders - * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these functions - * for codecs that don't advertise this capability will result in an error - * code being returned, usually VPX_CODEC_ERROR - * @{ - */ +/*!\brief Register for notification of frame completion. + * + * Registers a given function to be called when a decoded frame is + * available. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb Pointer to the callback function + * \param[in] user_priv User's private data + * + * \retval #VPX_CODEC_OK + * Callback successfully registered. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * posting slice completion. + */ +vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_frame_cb_fn_t cb, + void *user_priv); - /*!\brief put slice callback prototype - * - * This callback is invoked by the decoder to notify the application of - * the availability of partially decoded image data. 
The - */ - typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv, - const vpx_image_t *img, - const vpx_image_rect_t *valid, - const vpx_image_rect_t *update); +/*!@} - end defgroup cap_put_frame */ +/*!\defgroup cap_put_slice Slice-Based Decoding Functions + * + * The following functions are required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these + * functions + * for codecs that don't advertise this capability will result in an error + * code being returned, usually VPX_CODEC_ERROR + * @{ + */ - /*!\brief Register for notification of slice completion. - * - * Registers a given function to be called when a decoded slice is - * available. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] cb Pointer to the callback function - * \param[in] user_priv User's private data - * - * \retval #VPX_CODEC_OK - * Callback successfully registered. - * \retval #VPX_CODEC_ERROR - * Decoder context not initialized, or algorithm not capable of - * posting slice completion. - */ - vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, - vpx_codec_put_slice_cb_fn_t cb, - void *user_priv); +/*!\brief put slice callback prototype + * + * This callback is invoked by the decoder to notify the application of + * the availability of partially decoded image data. The + */ +typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv, + const vpx_image_t *img, + const vpx_image_rect_t *valid, + const vpx_image_rect_t *update); +/*!\brief Register for notification of slice completion. + * + * Registers a given function to be called when a decoded slice is + * available. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb Pointer to the callback function + * \param[in] user_priv User's private data + * + * \retval #VPX_CODEC_OK + * Callback successfully registered. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * posting slice completion. + */ +vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_slice_cb_fn_t cb, + void *user_priv); - /*!@} - end defgroup cap_put_slice*/ +/*!@} - end defgroup cap_put_slice*/ - /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions - * - * The following section is required to be implemented for all decoders - * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. - * Calling this function for codecs that don't advertise this capability - * will result in an error code being returned, usually VPX_CODEC_ERROR. - * - * \note - * Currently this only works with VP9. - * @{ - */ +/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions + * + * The following section is required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. + * Calling this function for codecs that don't advertise this capability + * will result in an error code being returned, usually VPX_CODEC_ERROR. + * + * \note + * Currently this only works with VP9. + * @{ + */ - /*!\brief Pass in external frame buffers for the decoder to use. - * - * Registers functions to be called when libvpx needs a frame buffer - * to decode the current frame and a function to be called when libvpx does - * not internally reference the frame buffer. This set function must - * be called before the first call to decode or libvpx will assume the - * default behavior of allocating frame buffers internally. 
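A hedged sketch of the callback pair that vpx_codec_set_frame_buffer_functions() expects, assuming the vpx_codec_frame_buffer_t layout (data, size, priv) and the get/release callback signatures declared in vpx/vpx_frame_buffer.h, which this hunk does not show.

#include <stdlib.h>
#include "vpx/vpx_decoder.h"
#include "vpx/vpx_frame_buffer.h" /* assumed: vpx_codec_frame_buffer_t + cb typedefs */

/* Give libvpx a caller-owned buffer of at least min_size bytes. */
static int example_get_fb(void *priv, size_t min_size,
                          vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)calloc(1, min_size);
  if (fb->data == NULL) return -1; /* a negative return reports failure */
  fb->size = min_size;
  fb->priv = fb->data; /* remember what to free in the release callback */
  return 0;
}

/* Called once libvpx no longer references the buffer. */
static int example_release_fb(void *priv, vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->priv);
  fb->data = NULL;
  return 0;
}

/* Wire the pair up before the first vpx_codec_decode() call (VP9 only):
 *   vpx_codec_set_frame_buffer_functions(&codec, example_get_fb,
 *                                        example_release_fb, NULL);
 */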
- * - * \param[in] ctx Pointer to this instance's context - * \param[in] cb_get Pointer to the get callback function - * \param[in] cb_release Pointer to the release callback function - * \param[in] cb_priv Callback's private data - * - * \retval #VPX_CODEC_OK - * External frame buffers will be used by libvpx. - * \retval #VPX_CODEC_INVALID_PARAM - * One or more of the callbacks were NULL. - * \retval #VPX_CODEC_ERROR - * Decoder context not initialized, or algorithm not capable of - * using external frame buffers. - * - * \note - * When decoding VP9, the application may be required to pass in at least - * #VPX_MAXIMUM_WORK_BUFFERS external frame - * buffers. - */ - vpx_codec_err_t vpx_codec_set_frame_buffer_functions( - vpx_codec_ctx_t *ctx, - vpx_get_frame_buffer_cb_fn_t cb_get, - vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); +/*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libvpx needs a frame buffer + * to decode the current frame and a function to be called when libvpx does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libvpx will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #VPX_CODEC_OK + * External frame buffers will be used by libvpx. + * \retval #VPX_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * using external frame buffers. + * + * \note + * When decoding VP9, the application may be required to pass in at least + * #VPX_MAXIMUM_WORK_BUFFERS external frame + * buffers. + */ +vpx_codec_err_t vpx_codec_set_frame_buffer_functions( + vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); - /*!@} - end defgroup cap_external_frame_buffer */ +/*!@} - end defgroup cap_external_frame_buffer */ - /*!@} - end defgroup decoder*/ +/*!@} - end defgroup decoder*/ #ifdef __cplusplus } #endif #endif // VPX_VPX_DECODER_H_ - diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 9a0f48ef824597b23c6282cb9ad9224b959364cb..62c3ce001e61e521d1632bab00a47d36dd4b60d3 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -31,869 +31,806 @@ extern "C" { #include "./vpx_codec.h" - /*!\brief Current ABI version number - * - * \internal - * If this file is altered in any way that changes the ABI, this value - * must be bumped. Examples include, but are not limited to, changing - * types, removing or reassigning enums, adding/removing/rearranging - * fields to structures - */ -#define VPX_ENCODER_ABI_VERSION (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ - +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_ENCODER_ABI_VERSION \ + (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ - /*! \brief Encoder capabilities bitfield - * - * Each encoder advertises the capabilities it supports as part of its - * ::vpx_codec_iface_t interface structure. 
Capabilities are extra - * interfaces or functionality, and are not required to be supported - * by an encoder. - * - * The available flags are specified by VPX_CODEC_CAP_* defines. - */ -#define VPX_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ +/*! \brief Encoder capabilities bitfield + * + * Each encoder advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra + * interfaces or functionality, and are not required to be supported + * by an encoder. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +#define VPX_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ - /*! Can output one partition at a time. Each partition is returned in its - * own VPX_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for - * every partition but the last. In this mode all frames are always - * returned partition by partition. - */ -#define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 +/*! Can output one partition at a time. Each partition is returned in its + * own VPX_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for + * every partition but the last. In this mode all frames are always + * returned partition by partition. + */ +#define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 /*! Can support input images at greater than 8 bitdepth. */ -#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 +#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 - /*! \brief Initialization-time Feature Enabling - * - * Certain codec features must be known at initialization time, to allow - * for proper memory allocation. - * - * The available flags are specified by VPX_CODEC_USE_* defines. - */ -#define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ -#define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 /**< Make the encoder output one - partition at a time. */ +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow + * for proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +#define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ +/*!\brief Make the encoder output one partition at a time. */ +#define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 #define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ +/*!\brief Generic fixed size buffer structure + * + * This structure is able to hold a reference to any fixed size buffer. + */ +typedef struct vpx_fixed_buf { + void *buf; /**< Pointer to the data */ + size_t sz; /**< Length of the buffer, in chars */ +} vpx_fixed_buf_t; /**< alias for struct vpx_fixed_buf */ - /*!\brief Generic fixed size buffer structure - * - * This structure is able to hold a reference to any fixed size buffer. - */ - typedef struct vpx_fixed_buf { - void *buf; /**< Pointer to the data */ - size_t sz; /**< Length of the buffer, in chars */ - } vpx_fixed_buf_t; /**< alias for struct vpx_fixed_buf */ - - - /*!\brief Time Stamp Type - * - * An integer, which when multiplied by the stream's time base, provides - * the absolute time of a sample. - */ - typedef int64_t vpx_codec_pts_t; - - - /*!\brief Compressed Frame Flags - * - * This type represents a bitfield containing information about a compressed - * frame that may be useful to an application. 
The most significant 16 bits - * can be used by an algorithm to provide additional detail, for example to - * support frame types that are codec specific (MPEG-1 D-frames for example) - */ - typedef uint32_t vpx_codec_frame_flags_t; -#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ -#define VPX_FRAME_IS_DROPPABLE 0x2 /**< frame can be dropped without affecting - the stream (no future frame depends on - this one) */ -#define VPX_FRAME_IS_INVISIBLE 0x4 /**< frame should be decoded but will not - be shown */ -#define VPX_FRAME_IS_FRAGMENT 0x8 /**< this is a fragment of the encoded - frame */ - - /*!\brief Error Resilient flags - * - * These flags define which error resilient features to enable in the - * encoder. The flags are specified through the - * vpx_codec_enc_cfg::g_error_resilient variable. - */ - typedef uint32_t vpx_codec_er_flags_t; -#define VPX_ERROR_RESILIENT_DEFAULT 0x1 /**< Improve resiliency against - losses of whole frames */ -#define VPX_ERROR_RESILIENT_PARTITIONS 0x2 /**< The frame partitions are - independently decodable by the - bool decoder, meaning that - partitions can be decoded even - though earlier partitions have - been lost. Note that intra - prediction is still done over - the partition boundary. */ - - /*!\brief Encoder output packet variants - * - * This enumeration lists the different kinds of data packets that can be - * returned by calls to vpx_codec_get_cx_data(). Algorithms \ref MAY - * extend this list to provide additional functionality. - */ - enum vpx_codec_cx_pkt_kind { - VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ - VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ - VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ - VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ - VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ - }; - - - /*!\brief Encoder output packet - * - * This structure contains the different kinds of output data the encoder - * may produce while compressing a frame. - */ - typedef struct vpx_codec_cx_pkt { - enum vpx_codec_cx_pkt_kind kind; /**< packet variant */ - union { - struct { - void *buf; /**< compressed data buffer */ - size_t sz; /**< length of compressed data */ - vpx_codec_pts_t pts; /**< time stamp to show frame - (in timebase units) */ - unsigned long duration; /**< duration to show frame - (in timebase units) */ - vpx_codec_frame_flags_t flags; /**< flags for this frame */ - int partition_id; /**< the partition id - defines the decoding order - of the partitions. Only - applicable when "output partition" - mode is enabled. First partition - has id 0.*/ - - } frame; /**< data for compressed frame packet */ - vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ - vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ - struct vpx_psnr_pkt { - unsigned int samples[4]; /**< Number of samples, total/y/u/v */ - uint64_t sse[4]; /**< sum squared error, total/y/u/v */ - double psnr[4]; /**< PSNR, total/y/u/v */ - } psnr; /**< data for PSNR packet */ - vpx_fixed_buf_t raw; /**< data for arbitrary packets */ - - /* This packet size is fixed to allow codecs to extend this - * interface without having to manage storage for raw packets, - * i.e., if it's smaller than 128 bytes, you can store in the - * packet list directly. 
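The packet union above is what vpx_codec_get_cx_data(), declared further down in this header, hands back; a small consumer might look like this sketch, with the FILE handles standing in for a real muxer and a stats sink.

#include <stdio.h>
#include "vpx/vpx_encoder.h"

/* Drain every pending packet after a vpx_codec_encode() call. */
static void drain_packets(vpx_codec_ctx_t *codec, FILE *frame_out,
                          FILE *stats_out) {
  vpx_codec_iter_t iter = NULL;
  const vpx_codec_cx_pkt_t *pkt;

  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
    switch (pkt->kind) {
      case VPX_CODEC_CX_FRAME_PKT: /* compressed frame -> muxer */
        fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, frame_out);
        break;
      case VPX_CODEC_STATS_PKT: /* first-pass stats -> global buffer/file */
        fwrite(pkt->data.twopass_stats.buf, 1, pkt->data.twopass_stats.sz,
               stats_out);
        break;
      default: /* unknown packet kinds must be ignored silently */
        break;
    }
  }
}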
- */ - char pad[128 - sizeof(enum vpx_codec_cx_pkt_kind)]; /**< fixed sz */ - } data; /**< packet data */ - } vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */ - - /*!\brief Rational Number - * - * This structure holds a fractional value. - */ - typedef struct vpx_rational { - int num; /**< fraction numerator */ - int den; /**< fraction denominator */ - } vpx_rational_t; /**< alias for struct vpx_rational */ - - - /*!\brief Multi-pass Encoding Pass */ - enum vpx_enc_pass { - VPX_RC_ONE_PASS, /**< Single pass mode */ - VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */ - VPX_RC_LAST_PASS /**< Final pass of multi-pass mode */ - }; - - - /*!\brief Rate control mode */ - enum vpx_rc_mode { - VPX_VBR, /**< Variable Bit Rate (VBR) mode */ - VPX_CBR, /**< Constant Bit Rate (CBR) mode */ - VPX_CQ, /**< Constrained Quality (CQ) mode */ - VPX_Q, /**< Constant Quality (Q) mode */ - }; - - - /*!\brief Keyframe placement mode. - * - * This enumeration determines whether keyframes are placed automatically by - * the encoder or whether this behavior is disabled. Older releases of this - * SDK were implemented such that VPX_KF_FIXED meant keyframes were disabled. - * This name is confusing for this behavior, so the new symbols to be used - * are VPX_KF_AUTO and VPX_KF_DISABLED. - */ - enum vpx_kf_mode { - VPX_KF_FIXED, /**< deprecated, implies VPX_KF_DISABLED */ - VPX_KF_AUTO, /**< Encoder determines optimal placement automatically */ - VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ - }; - - - /*!\brief Encoded Frame Flags - * - * This type indicates a bitfield to be passed to vpx_codec_encode(), defining - * per-frame boolean values. By convention, bits common to all codecs will be - * named VPX_EFLAG_*, and bits specific to an algorithm will be named - * /algo/_eflag_*. The lower order 16 bits are reserved for common use. - */ - typedef long vpx_enc_frame_flags_t; -#define VPX_EFLAG_FORCE_KF (1<<0) /**< Force this frame to be a keyframe */ - - - /*!\brief Encoder configuration structure - * - * This structure contains the encoder settings that have common representations - * across all codecs. This doesn't imply that all codecs support all features, - * however. - */ - typedef struct vpx_codec_enc_cfg { - /* - * generic settings (g) - */ - - /*!\brief Algorithm specific "usage" value - * - * Algorithms may define multiple values for usage, which may convey the - * intent of how the application intends to use the stream. If this value - * is non-zero, consult the documentation for the codec to determine its - * meaning. - */ - unsigned int g_usage; - - - /*!\brief Maximum number of threads to use - * - * For multi-threaded implementations, use no more than this number of - * threads. The codec may use fewer threads than allowed. The value - * 0 is equivalent to the value 1. - */ - unsigned int g_threads; - - - /*!\brief Bitstream profile to use - * - * Some codecs support a notion of multiple bitstream profiles. Typically - * this maps to a set of features that are turned on or off. Often the - * profile to use is determined by the features of the intended decoder. - * Consult the documentation for the codec to determine the valid values - * for this parameter, or set to zero for a sane default. - */ - unsigned int g_profile; /**< profile of bitstream to use */ - - - - /*!\brief Width of the frame - * - * This value identifies the presentation resolution of the frame, - * in pixels. Note that the frames passed as input to the encoder must - * have this resolution. 
Frames will be presented by the decoder in this - * resolution, independent of any spatial resampling the encoder may do. - */ - unsigned int g_w; - - - /*!\brief Height of the frame - * - * This value identifies the presentation resolution of the frame, - * in pixels. Note that the frames passed as input to the encoder must - * have this resolution. Frames will be presented by the decoder in this - * resolution, independent of any spatial resampling the encoder may do. - */ - unsigned int g_h; - - /*!\brief Bit-depth of the codec - * - * This value identifies the bit_depth of the codec, - * Only certain bit-depths are supported as identified in the - * vpx_bit_depth_t enum. - */ - vpx_bit_depth_t g_bit_depth; - - /*!\brief Bit-depth of the input frames - * - * This value identifies the bit_depth of the input frames in bits. - * Note that the frames passed as input to the encoder must have - * this bit-depth. - */ - unsigned int g_input_bit_depth; - - /*!\brief Stream timebase units - * - * Indicates the smallest interval of time, in seconds, used by the stream. - * For fixed frame rate material, or variable frame rate material where - * frames are timed at a multiple of a given clock (ex: video capture), - * the \ref RECOMMENDED method is to set the timebase to the reciprocal - * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the - * pts to correspond to the frame number, which can be handy. For - * re-encoding video from containers with absolute time timestamps, the - * \ref RECOMMENDED method is to set the timebase to that of the parent - * container or multimedia framework (ex: 1/1000 for ms, as in FLV). - */ - struct vpx_rational g_timebase; - - - /*!\brief Enable error resilient modes. - * - * The error resilient bitfield indicates to the encoder which features - * it should enable to take measures for streaming over lossy or noisy - * links. - */ - vpx_codec_er_flags_t g_error_resilient; - - - /*!\brief Multi-pass Encoding Mode - * - * This value should be set to the current phase for multi-pass encoding. - * For single pass, set to #VPX_RC_ONE_PASS. - */ - enum vpx_enc_pass g_pass; - - - /*!\brief Allow lagged encoding - * - * If set, this value allows the encoder to consume a number of input - * frames before producing output frames. This allows the encoder to - * base decisions for the current frame on future frames. This does - * increase the latency of the encoding pipeline, so it is not appropriate - * in all situations (ex: realtime encoding). - * - * Note that this is a maximum value -- the encoder may produce frames - * sooner than the given limit. Set this value to 0 to disable this - * feature. - */ - unsigned int g_lag_in_frames; - - - /* - * rate control settings (rc) - */ - - /*!\brief Temporal resampling configuration, if supported by the codec. - * - * Temporal resampling allows the codec to "drop" frames as a strategy to - * meet its target data rate. This can cause temporal discontinuities in - * the encoded video, which may appear as stuttering during playback. This - * trade-off is often acceptable, but for many applications is not. It can - * be disabled in these cases. - * - * Note that not all codecs support this feature. All vpx VPx codecs do. - * For other codecs, consult the documentation for that algorithm. - * - * This threshold is described as a percentage of the target data buffer. - * When the data buffer falls below this percentage of fullness, a - * dropped frame is indicated. 
Set the threshold to zero (0) to disable - * this feature. - */ - unsigned int rc_dropframe_thresh; - - - /*!\brief Enable/disable spatial resampling, if supported by the codec. - * - * Spatial resampling allows the codec to compress a lower resolution - * version of the frame, which is then upscaled by the encoder to the - * correct presentation resolution. This increases visual quality at - * low data rates, at the expense of CPU time on the encoder/decoder. - */ - unsigned int rc_resize_allowed; - - /*!\brief Internal coded frame width. - * - * If spatial resampling is enabled this specifies the width of the - * encoded frame. - */ - unsigned int rc_scaled_width; - - /*!\brief Internal coded frame height. - * - * If spatial resampling is enabled this specifies the height of the - * encoded frame. - */ - unsigned int rc_scaled_height; - - /*!\brief Spatial resampling up watermark. - * - * This threshold is described as a percentage of the target data buffer. - * When the data buffer rises above this percentage of fullness, the - * encoder will step up to a higher resolution version of the frame. - */ - unsigned int rc_resize_up_thresh; - - - /*!\brief Spatial resampling down watermark. - * - * This threshold is described as a percentage of the target data buffer. - * When the data buffer falls below this percentage of fullness, the - * encoder will step down to a lower resolution version of the frame. - */ - unsigned int rc_resize_down_thresh; - - - /*!\brief Rate control algorithm to use. - * - * Indicates whether the end usage of this stream is to be streamed over - * a bandwidth constrained link, indicating that Constant Bit Rate (CBR) - * mode should be used, or whether it will be played back on a high - * bandwidth link, as from a local disk, where higher variations in - * bitrate are acceptable. - */ - enum vpx_rc_mode rc_end_usage; - - - /*!\brief Two-pass stats buffer. - * - * A buffer containing all of the stats packets produced in the first - * pass, concatenated. - */ - vpx_fixed_buf_t rc_twopass_stats_in; - - /*!\brief first pass mb stats buffer. - * - * A buffer containing all of the first pass mb stats packets produced - * in the first pass, concatenated. - */ - vpx_fixed_buf_t rc_firstpass_mb_stats_in; - - /*!\brief Target data rate - * - * Target bandwidth to use for this stream, in kilobits per second. - */ - unsigned int rc_target_bitrate; - - - /* - * quantizer settings - */ - - - /*!\brief Minimum (Best Quality) Quantizer - * - * The quantizer is the most direct control over the quality of the - * encoded image. The range of valid values for the quantizer is codec - * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. - */ - unsigned int rc_min_quantizer; - - - /*!\brief Maximum (Worst Quality) Quantizer - * - * The quantizer is the most direct control over the quality of the - * encoded image. The range of valid values for the quantizer is codec - * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. - */ - unsigned int rc_max_quantizer; - - - /* - * bitrate tolerance - */ - - - /*!\brief Rate control adaptation undershoot control - * - * This value, expressed as a percentage of the target bitrate, - * controls the maximum allowed adaptation speed of the codec. 
- * This factor controls the maximum amount of bits that can - * be subtracted from the target bitrate in order to compensate - * for prior overshoot. - * - * Valid values in the range 0-1000. - */ - unsigned int rc_undershoot_pct; - - - /*!\brief Rate control adaptation overshoot control - * - * This value, expressed as a percentage of the target bitrate, - * controls the maximum allowed adaptation speed of the codec. - * This factor controls the maximum amount of bits that can - * be added to the target bitrate in order to compensate for - * prior undershoot. - * - * Valid values in the range 0-1000. - */ - unsigned int rc_overshoot_pct; - - - /* - * decoder buffer model parameters - */ - - - /*!\brief Decoder Buffer Size - * - * This value indicates the amount of data that may be buffered by the - * decoding application. Note that this value is expressed in units of - * time (milliseconds). For example, a value of 5000 indicates that the - * client will buffer (at least) 5000ms worth of encoded data. Use the - * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if - * necessary. - */ - unsigned int rc_buf_sz; - - - /*!\brief Decoder Buffer Initial Size - * - * This value indicates the amount of data that will be buffered by the - * decoding application prior to beginning playback. This value is - * expressed in units of time (milliseconds). Use the target bitrate - * (#rc_target_bitrate) to convert to bits/bytes, if necessary. - */ - unsigned int rc_buf_initial_sz; - - - /*!\brief Decoder Buffer Optimal Size - * - * This value indicates the amount of data that the encoder should try - * to maintain in the decoder's buffer. This value is expressed in units - * of time (milliseconds). Use the target bitrate (#rc_target_bitrate) - * to convert to bits/bytes, if necessary. - */ - unsigned int rc_buf_optimal_sz; - +/*!\brief Time Stamp Type + * + * An integer, which when multiplied by the stream's time base, provides + * the absolute time of a sample. + */ +typedef int64_t vpx_codec_pts_t; - /* - * 2 pass rate control parameters - */ +/*!\brief Compressed Frame Flags + * + * This type represents a bitfield containing information about a compressed + * frame that may be useful to an application. The most significant 16 bits + * can be used by an algorithm to provide additional detail, for example to + * support frame types that are codec specific (MPEG-1 D-frames for example) + */ +typedef uint32_t vpx_codec_frame_flags_t; +#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ +/*!\brief frame can be dropped without affecting the stream (no future frame + * depends on this one) */ +#define VPX_FRAME_IS_DROPPABLE 0x2 +/*!\brief frame should be decoded but will not be shown */ +#define VPX_FRAME_IS_INVISIBLE 0x4 +/*!\brief this is a fragment of the encoded frame */ +#define VPX_FRAME_IS_FRAGMENT 0x8 + +/*!\brief Error Resilient flags + * + * These flags define which error resilient features to enable in the + * encoder. The flags are specified through the + * vpx_codec_enc_cfg::g_error_resilient variable. + */ +typedef uint32_t vpx_codec_er_flags_t; +/*!\brief Improve resiliency against losses of whole frames */ +#define VPX_ERROR_RESILIENT_DEFAULT 0x1 +/*!\brief The frame partitions are independently decodable by the bool decoder, + * meaning that partitions can be decoded even though earlier partitions have + * been lost. Note that intra prediction is still done over the partition + * boundary. 
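A small sketch of combining the two flags above into vpx_codec_enc_cfg::g_error_resilient by bitwise OR, assuming a configuration obtained from vpx_codec_enc_config_default().

#include "vpx/vpx_encoder.h"

/* Enable both error-resilience features described above for streaming
 * over lossy links. */
static void enable_error_resilience(vpx_codec_enc_cfg_t *cfg) {
  cfg->g_error_resilient =
      VPX_ERROR_RESILIENT_DEFAULT | VPX_ERROR_RESILIENT_PARTITIONS;
}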
*/ +#define VPX_ERROR_RESILIENT_PARTITIONS 0x2 + +/*!\brief Encoder output packet variants + * + * This enumeration lists the different kinds of data packets that can be + * returned by calls to vpx_codec_get_cx_data(). Algorithms \ref MAY + * extend this list to provide additional functionality. + */ +enum vpx_codec_cx_pkt_kind { + VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ + VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ +}; + +/*!\brief Encoder output packet + * + * This structure contains the different kinds of output data the encoder + * may produce while compressing a frame. + */ +typedef struct vpx_codec_cx_pkt { + enum vpx_codec_cx_pkt_kind kind; /**< packet variant */ + union { + struct { + void *buf; /**< compressed data buffer */ + size_t sz; /**< length of compressed data */ + /*!\brief time stamp to show frame (in timebase units) */ + vpx_codec_pts_t pts; + /*!\brief duration to show frame (in timebase units) */ + unsigned long duration; + vpx_codec_frame_flags_t flags; /**< flags for this frame */ + /*!\brief the partition id defines the decoding order of the partitions. + * Only applicable when "output partition" mode is enabled. First + * partition has id 0.*/ + int partition_id; + } frame; /**< data for compressed frame packet */ + vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ + struct vpx_psnr_pkt { + unsigned int samples[4]; /**< Number of samples, total/y/u/v */ + uint64_t sse[4]; /**< sum squared error, total/y/u/v */ + double psnr[4]; /**< PSNR, total/y/u/v */ + } psnr; /**< data for PSNR packet */ + vpx_fixed_buf_t raw; /**< data for arbitrary packets */ + + /* This packet size is fixed to allow codecs to extend this + * interface without having to manage storage for raw packets, + * i.e., if it's smaller than 128 bytes, you can store in the + * packet list directly. + */ + char pad[128 - sizeof(enum vpx_codec_cx_pkt_kind)]; /**< fixed sz */ + } data; /**< packet data */ +} vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */ + +/*!\brief Rational Number + * + * This structure holds a fractional value. + */ +typedef struct vpx_rational { + int num; /**< fraction numerator */ + int den; /**< fraction denominator */ +} vpx_rational_t; /**< alias for struct vpx_rational */ + +/*!\brief Multi-pass Encoding Pass */ +enum vpx_enc_pass { + VPX_RC_ONE_PASS, /**< Single pass mode */ + VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */ + VPX_RC_LAST_PASS /**< Final pass of multi-pass mode */ +}; + +/*!\brief Rate control mode */ +enum vpx_rc_mode { + VPX_VBR, /**< Variable Bit Rate (VBR) mode */ + VPX_CBR, /**< Constant Bit Rate (CBR) mode */ + VPX_CQ, /**< Constrained Quality (CQ) mode */ + VPX_Q, /**< Constant Quality (Q) mode */ +}; + +/*!\brief Keyframe placement mode. + * + * This enumeration determines whether keyframes are placed automatically by + * the encoder or whether this behavior is disabled. Older releases of this + * SDK were implemented such that VPX_KF_FIXED meant keyframes were disabled. + * This name is confusing for this behavior, so the new symbols to be used + * are VPX_KF_AUTO and VPX_KF_DISABLED. 
+ */ +enum vpx_kf_mode { + VPX_KF_FIXED, /**< deprecated, implies VPX_KF_DISABLED */ + VPX_KF_AUTO, /**< Encoder determines optimal placement automatically */ + VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ +}; +/*!\brief Encoded Frame Flags + * + * This type indicates a bitfield to be passed to vpx_codec_encode(), defining + * per-frame boolean values. By convention, bits common to all codecs will be + * named VPX_EFLAG_*, and bits specific to an algorithm will be named + * /algo/_eflag_*. The lower order 16 bits are reserved for common use. + */ +typedef long vpx_enc_frame_flags_t; +#define VPX_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */ - /*!\brief Two-pass mode CBR/VBR bias - * - * Bias, expressed on a scale of 0 to 100, for determining target size - * for the current frame. The value 0 indicates the optimal CBR mode - * value should be used. The value 100 indicates the optimal VBR mode - * value should be used. Values in between indicate which way the - * encoder should "lean." - */ - unsigned int rc_2pass_vbr_bias_pct; /**< RC mode bias between CBR and VBR(0-100: 0->CBR, 100->VBR) */ +/*!\brief Encoder configuration structure + * + * This structure contains the encoder settings that have common representations + * across all codecs. This doesn't imply that all codecs support all features, + * however. + */ +typedef struct vpx_codec_enc_cfg { + /* + * generic settings (g) + */ + /*!\brief Algorithm specific "usage" value + * + * Algorithms may define multiple values for usage, which may convey the + * intent of how the application intends to use the stream. If this value + * is non-zero, consult the documentation for the codec to determine its + * meaning. + */ + unsigned int g_usage; - /*!\brief Two-pass mode per-GOP minimum bitrate - * - * This value, expressed as a percentage of the target bitrate, indicates - * the minimum bitrate to be used for a single GOP (aka "section") - */ - unsigned int rc_2pass_vbr_minsection_pct; + /*!\brief Maximum number of threads to use + * + * For multi-threaded implementations, use no more than this number of + * threads. The codec may use fewer threads than allowed. The value + * 0 is equivalent to the value 1. + */ + unsigned int g_threads; + /*!\brief Bitstream profile to use + * + * Some codecs support a notion of multiple bitstream profiles. Typically + * this maps to a set of features that are turned on or off. Often the + * profile to use is determined by the features of the intended decoder. + * Consult the documentation for the codec to determine the valid values + * for this parameter, or set to zero for a sane default. + */ + unsigned int g_profile; /**< profile of bitstream to use */ - /*!\brief Two-pass mode per-GOP maximum bitrate - * - * This value, expressed as a percentage of the target bitrate, indicates - * the maximum bitrate to be used for a single GOP (aka "section") - */ - unsigned int rc_2pass_vbr_maxsection_pct; + /*!\brief Width of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_w; + /*!\brief Height of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. 
Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_h; - /* - * keyframing settings (kf) - */ + /*!\brief Bit-depth of the codec + * + * This value identifies the bit_depth of the codec, + * Only certain bit-depths are supported as identified in the + * vpx_bit_depth_t enum. + */ + vpx_bit_depth_t g_bit_depth; - /*!\brief Keyframe placement mode - * - * This value indicates whether the encoder should place keyframes at a - * fixed interval, or determine the optimal placement automatically - * (as governed by the #kf_min_dist and #kf_max_dist parameters) - */ - enum vpx_kf_mode kf_mode; + /*!\brief Bit-depth of the input frames + * + * This value identifies the bit_depth of the input frames in bits. + * Note that the frames passed as input to the encoder must have + * this bit-depth. + */ + unsigned int g_input_bit_depth; + + /*!\brief Stream timebase units + * + * Indicates the smallest interval of time, in seconds, used by the stream. + * For fixed frame rate material, or variable frame rate material where + * frames are timed at a multiple of a given clock (ex: video capture), + * the \ref RECOMMENDED method is to set the timebase to the reciprocal + * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the + * pts to correspond to the frame number, which can be handy. For + * re-encoding video from containers with absolute time timestamps, the + * \ref RECOMMENDED method is to set the timebase to that of the parent + * container or multimedia framework (ex: 1/1000 for ms, as in FLV). + */ + struct vpx_rational g_timebase; + /*!\brief Enable error resilient modes. + * + * The error resilient bitfield indicates to the encoder which features + * it should enable to take measures for streaming over lossy or noisy + * links. + */ + vpx_codec_er_flags_t g_error_resilient; - /*!\brief Keyframe minimum interval - * - * This value, expressed as a number of frames, prevents the encoder from - * placing a keyframe nearer than kf_min_dist to the previous keyframe. At - * least kf_min_dist frames non-keyframes will be coded before the next - * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval. - */ - unsigned int kf_min_dist; + /*!\brief Multi-pass Encoding Mode + * + * This value should be set to the current phase for multi-pass encoding. + * For single pass, set to #VPX_RC_ONE_PASS. + */ + enum vpx_enc_pass g_pass; + /*!\brief Allow lagged encoding + * + * If set, this value allows the encoder to consume a number of input + * frames before producing output frames. This allows the encoder to + * base decisions for the current frame on future frames. This does + * increase the latency of the encoding pipeline, so it is not appropriate + * in all situations (ex: realtime encoding). + * + * Note that this is a maximum value -- the encoder may produce frames + * sooner than the given limit. Set this value to 0 to disable this + * feature. + */ + unsigned int g_lag_in_frames; - /*!\brief Keyframe maximum interval - * - * This value, expressed as a number of frames, forces the encoder to code - * a keyframe if one has not been coded in the last kf_max_dist frames. - * A value of 0 implies all frames will be keyframes. Set kf_min_dist - * equal to kf_max_dist for a fixed interval. 
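Following the note above, a fixed keyframe cadence comes from pinning kf_min_dist and kf_max_dist to the same value; a hypothetical helper:

#include "vpx/vpx_encoder.h"

/* Force a keyframe exactly every `interval` frames by setting the
 * minimum and maximum keyframe distances to the same value. */
static void set_fixed_keyframe_interval(vpx_codec_enc_cfg_t *cfg,
                                        unsigned int interval) {
  cfg->kf_mode = VPX_KF_AUTO;
  cfg->kf_min_dist = interval;
  cfg->kf_max_dist = interval;
}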
- */ - unsigned int kf_max_dist; - } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ + /* + * rate control settings (rc) + */ - /*!\brief Initialize an encoder instance + /*!\brief Temporal resampling configuration, if supported by the codec. * - * Initializes a encoder context using the given interface. Applications - * should call the vpx_codec_enc_init convenience macro instead of this - * function directly, to ensure that the ABI version number parameter - * is properly initialized. + * Temporal resampling allows the codec to "drop" frames as a strategy to + * meet its target data rate. This can cause temporal discontinuities in + * the encoded video, which may appear as stuttering during playback. This + * trade-off is often acceptable, but for many applications is not. It can + * be disabled in these cases. * - * If the library was configured with --disable-multithread, this call - * is not thread safe and should be guarded with a lock if being used - * in a multithreaded context. + * Note that not all codecs support this feature. All vpx VPx codecs do. + * For other codecs, consult the documentation for that algorithm. * - * \param[in] ctx Pointer to this instance's context. - * \param[in] iface Pointer to the algorithm interface to use. - * \param[in] cfg Configuration to use, if known. May be NULL. - * \param[in] flags Bitfield of VPX_CODEC_USE_* flags - * \param[in] ver ABI version number. Must be set to - * VPX_ENCODER_ABI_VERSION - * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. + * This threshold is described as a percentage of the target data buffer. + * When the data buffer falls below this percentage of fullness, a + * dropped frame is indicated. Set the threshold to zero (0) to disable + * this feature. */ - vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, - const vpx_codec_enc_cfg_t *cfg, - vpx_codec_flags_t flags, - int ver); + unsigned int rc_dropframe_thresh; - - /*!\brief Convenience macro for vpx_codec_enc_init_ver() + /*!\brief Enable/disable spatial resampling, if supported by the codec. * - * Ensures the ABI version parameter is properly set. + * Spatial resampling allows the codec to compress a lower resolution + * version of the frame, which is then upscaled by the encoder to the + * correct presentation resolution. This increases visual quality at + * low data rates, at the expense of CPU time on the encoder/decoder. */ -#define vpx_codec_enc_init(ctx, iface, cfg, flags) \ - vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION) - + unsigned int rc_resize_allowed; - /*!\brief Initialize multi-encoder instance + /*!\brief Internal coded frame width. * - * Initializes multi-encoder context using the given interface. - * Applications should call the vpx_codec_enc_init_multi convenience macro - * instead of this function directly, to ensure that the ABI version number - * parameter is properly initialized. - * - * \param[in] ctx Pointer to this instance's context. - * \param[in] iface Pointer to the algorithm interface to use. - * \param[in] cfg Configuration to use, if known. May be NULL. - * \param[in] num_enc Total number of encoders. - * \param[in] flags Bitfield of VPX_CODEC_USE_* flags - * \param[in] dsf Pointer to down-sampling factors. - * \param[in] ver ABI version number. Must be set to - * VPX_ENCODER_ABI_VERSION - * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. 
- * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. + * If spatial resampling is enabled this specifies the width of the + * encoded frame. */ - vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, - int num_enc, - vpx_codec_flags_t flags, - vpx_rational_t *dsf, - int ver); - + unsigned int rc_scaled_width; - /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver() + /*!\brief Internal coded frame height. * - * Ensures the ABI version parameter is properly set. + * If spatial resampling is enabled this specifies the height of the + * encoded frame. */ -#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \ - vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \ - VPX_ENCODER_ABI_VERSION) + unsigned int rc_scaled_height; - - /*!\brief Get a default configuration - * - * Initializes a encoder configuration structure with default values. Supports - * the notion of "usages" so that an algorithm may offer different default - * settings depending on the user's intended goal. This function \ref SHOULD - * be called by all applications to initialize the configuration structure - * before specializing the configuration with application specific values. + /*!\brief Spatial resampling up watermark. * - * \param[in] iface Pointer to the algorithm interface to use. - * \param[out] cfg Configuration buffer to populate. - * \param[in] reserved Must set to 0 for VP8 and VP9. + * This threshold is described as a percentage of the target data buffer. + * When the data buffer rises above this percentage of fullness, the + * encoder will step up to a higher resolution version of the frame. + */ + unsigned int rc_resize_up_thresh; + + /*!\brief Spatial resampling down watermark. * - * \retval #VPX_CODEC_OK - * The configuration was populated. - * \retval #VPX_CODEC_INCAPABLE - * Interface is not an encoder interface. - * \retval #VPX_CODEC_INVALID_PARAM - * A parameter was NULL, or the usage value was not recognized. - */ - vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, - unsigned int reserved); - - - /*!\brief Set or change configuration - * - * Reconfigures an encoder instance according to the given configuration. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] cfg Configuration buffer to use - * - * \retval #VPX_CODEC_OK - * The configuration was populated. - * \retval #VPX_CODEC_INCAPABLE - * Interface is not an encoder interface. - * \retval #VPX_CODEC_INVALID_PARAM - * A parameter was NULL, or the usage value was not recognized. - */ - vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, - const vpx_codec_enc_cfg_t *cfg); - - - /*!\brief Get global stream headers - * - * Retrieves a stream level global header packet, if supported by the codec. - * - * \param[in] ctx Pointer to this instance's context - * - * \retval NULL - * Encoder does not support global header - * \retval Non-NULL - * Pointer to buffer containing global header packet - */ - vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); - - -#define VPX_DL_REALTIME (1) /**< deadline parameter analogous to - * VPx REALTIME mode. */ -#define VPX_DL_GOOD_QUALITY (1000000) /**< deadline parameter analogous to - * VPx GOOD QUALITY mode. */ -#define VPX_DL_BEST_QUALITY (0) /**< deadline parameter analogous to - * VPx BEST QUALITY mode. */ - /*!\brief Encode a frame - * - * Encodes a video frame at the given "presentation time." 
The presentation - * time stamp (PTS) \ref MUST be strictly increasing. - * - * The encoder supports the notion of a soft real-time deadline. Given a - * non-zero value to the deadline parameter, the encoder will make a "best - * effort" guarantee to return before the given time slice expires. It is - * implicit that limiting the available time to encode will degrade the - * output quality. The encoder can be given an unlimited time to produce the - * best possible frame by specifying a deadline of '0'. This deadline - * supercedes the VPx notion of "best quality, good quality, realtime". - * Applications that wish to map these former settings to the new deadline - * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY, - * and #VPX_DL_BEST_QUALITY. + * This threshold is described as a percentage of the target data buffer. + * When the data buffer falls below this percentage of fullness, the + * encoder will step down to a lower resolution version of the frame. + */ + unsigned int rc_resize_down_thresh; + + /*!\brief Rate control algorithm to use. * - * When the last frame has been passed to the encoder, this function should - * continue to be called, with the img parameter set to NULL. This will - * signal the end-of-stream condition to the encoder and allow it to encode - * any held buffers. Encoding is complete when vpx_codec_encode() is called - * and vpx_codec_get_cx_data() returns no data. + * Indicates whether the end usage of this stream is to be streamed over + * a bandwidth constrained link, indicating that Constant Bit Rate (CBR) + * mode should be used, or whether it will be played back on a high + * bandwidth link, as from a local disk, where higher variations in + * bitrate are acceptable. + */ + enum vpx_rc_mode rc_end_usage; + + /*!\brief Two-pass stats buffer. * - * \param[in] ctx Pointer to this instance's context - * \param[in] img Image data to encode, NULL to flush. - * \param[in] pts Presentation time stamp, in timebase units. - * \param[in] duration Duration to show frame, in timebase units. - * \param[in] flags Flags to use for encoding this frame. - * \param[in] deadline Time to spend encoding, in microseconds. (0=infinite) + * A buffer containing all of the stats packets produced in the first + * pass, concatenated. + */ + vpx_fixed_buf_t rc_twopass_stats_in; + + /*!\brief first pass mb stats buffer. * - * \retval #VPX_CODEC_OK - * The configuration was populated. - * \retval #VPX_CODEC_INCAPABLE - * Interface is not an encoder interface. - * \retval #VPX_CODEC_INVALID_PARAM - * A parameter was NULL, the image format is unsupported, etc. + * A buffer containing all of the first pass mb stats packets produced + * in the first pass, concatenated. */ - vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned long duration, - vpx_enc_frame_flags_t flags, - unsigned long deadline); + vpx_fixed_buf_t rc_firstpass_mb_stats_in; - /*!\brief Set compressed data output buffer + /*!\brief Target data rate * - * Sets the buffer that the codec should output the compressed data - * into. This call effectively sets the buffer pointer returned in the - * next VPX_CODEC_CX_FRAME_PKT packet. Subsequent packets will be - * appended into this buffer. The buffer is preserved across frames, - * so applications must periodically call this function after flushing - * the accumulated compressed data to disk or to the network to reset - * the pointer to the buffer's head. 
+ * Target bandwidth to use for this stream, in kilobits per second. + */ + unsigned int rc_target_bitrate; + + /* + * quantizer settings + */ + + /*!\brief Minimum (Best Quality) Quantizer * - * `pad_before` bytes will be skipped before writing the compressed - * data, and `pad_after` bytes will be appended to the packet. The size - * of the packet will be the sum of the size of the actual compressed - * data, pad_before, and pad_after. The padding bytes will be preserved - * (not overwritten). + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * vpx_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_min_quantizer; + + /*!\brief Maximum (Worst Quality) Quantizer * - * Note that calling this function does not guarantee that the returned - * compressed data will be placed into the specified buffer. In the - * event that the encoded data will not fit into the buffer provided, - * the returned packet \ref MAY point to an internal buffer, as it would - * if this call were never used. In this event, the output packet will - * NOT have any padding, and the application must free space and copy it - * to the proper place. This is of particular note in configurations - * that may output multiple packets for a single encoded frame (e.g., lagged - * encoding) or if the application does not reset the buffer periodically. + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * vpx_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_max_quantizer; + + /* + * bitrate tolerance + */ + + /*!\brief Rate control adaptation undershoot control * - * Applications may restore the default behavior of the codec providing - * the compressed data buffer by calling this function with a NULL - * buffer. + * This value, expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. + * This factor controls the maximum amount of bits that can + * be subtracted from the target bitrate in order to compensate + * for prior overshoot. * - * Applications \ref MUSTNOT call this function during iteration of - * vpx_codec_get_cx_data(). + * Valid values in the range 0-1000. + */ + unsigned int rc_undershoot_pct; + + /*!\brief Rate control adaptation overshoot control * - * \param[in] ctx Pointer to this instance's context - * \param[in] buf Buffer to store compressed data into - * \param[in] pad_before Bytes to skip before writing compressed data - * \param[in] pad_after Bytes to skip after writing compressed data + * This value, expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. + * This factor controls the maximum amount of bits that can + * be added to the target bitrate in order to compensate for + * prior undershoot. * - * \retval #VPX_CODEC_OK - * The buffer was set successfully. - * \retval #VPX_CODEC_INVALID_PARAM - * A parameter was NULL, the image format is unsupported, etc. + * Valid values in the range 0-1000. 
*/ - vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, - const vpx_fixed_buf_t *buf, - unsigned int pad_before, - unsigned int pad_after); + unsigned int rc_overshoot_pct; + /* + * decoder buffer model parameters + */ - /*!\brief Encoded data iterator - * - * Iterates over a list of data packets to be passed from the encoder to the - * application. The different kinds of packets available are enumerated in - * #vpx_codec_cx_pkt_kind. + /*!\brief Decoder Buffer Size * - * #VPX_CODEC_CX_FRAME_PKT packets should be passed to the application's - * muxer. Multiple compressed frames may be in the list. - * #VPX_CODEC_STATS_PKT packets should be appended to a global buffer. + * This value indicates the amount of data that may be buffered by the + * decoding application. Note that this value is expressed in units of + * time (milliseconds). For example, a value of 5000 indicates that the + * client will buffer (at least) 5000ms worth of encoded data. Use the + * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if + * necessary. + */ + unsigned int rc_buf_sz; + + /*!\brief Decoder Buffer Initial Size * - * The application \ref MUST silently ignore any packet kinds that it does - * not recognize or support. + * This value indicates the amount of data that will be buffered by the + * decoding application prior to beginning playback. This value is + * expressed in units of time (milliseconds). Use the target bitrate + * (#rc_target_bitrate) to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_initial_sz; + + /*!\brief Decoder Buffer Optimal Size * - * The data buffers returned from this function are only guaranteed to be - * valid until the application makes another call to any vpx_codec_* function. + * This value indicates the amount of data that the encoder should try + * to maintain in the decoder's buffer. This value is expressed in units + * of time (milliseconds). Use the target bitrate (#rc_target_bitrate) + * to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_optimal_sz; + + /* + * 2 pass rate control parameters + */ + + /*!\brief Two-pass mode CBR/VBR bias * - * \param[in] ctx Pointer to this instance's context - * \param[in,out] iter Iterator storage, initialized to NULL + * Bias, expressed on a scale of 0 to 100, for determining target size + * for the current frame. The value 0 indicates the optimal CBR mode + * value should be used. The value 100 indicates the optimal VBR mode + * value should be used. Values in between indicate which way the + * encoder should "lean." + */ + unsigned int rc_2pass_vbr_bias_pct; + + /*!\brief Two-pass mode per-GOP minimum bitrate * - * \return Returns a pointer to an output data packet (compressed frame data, - * two-pass statistics, etc.) or NULL to signal end-of-list. + * This value, expressed as a percentage of the target bitrate, indicates + * the minimum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_minsection_pct; + + /*!\brief Two-pass mode per-GOP maximum bitrate * + * This value, expressed as a percentage of the target bitrate, indicates + * the maximum bitrate to be used for a single GOP (aka "section") */ - const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, - vpx_codec_iter_t *iter); + unsigned int rc_2pass_vbr_maxsection_pct; + /* + * keyframing settings (kf) + */ - /*!\brief Get Preview Frame - * - * Returns an image that can be used as a preview. Shows the image as it would - * exist at the decompressor. 
The application \ref MUST NOT write into this - * image buffer. + /*!\brief Keyframe placement mode * - * \param[in] ctx Pointer to this instance's context + * This value indicates whether the encoder should place keyframes at a + * fixed interval, or determine the optimal placement automatically + * (as governed by the #kf_min_dist and #kf_max_dist parameters) + */ + enum vpx_kf_mode kf_mode; + + /*!\brief Keyframe minimum interval * - * \return Returns a pointer to a preview image, or NULL if no image is - * available. + * This value, expressed as a number of frames, prevents the encoder from + * placing a keyframe nearer than kf_min_dist to the previous keyframe. At + * least kf_min_dist frames non-keyframes will be coded before the next + * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval. + */ + unsigned int kf_min_dist; + + /*!\brief Keyframe maximum interval * + * This value, expressed as a number of frames, forces the encoder to code + * a keyframe if one has not been coded in the last kf_max_dist frames. + * A value of 0 implies all frames will be keyframes. Set kf_min_dist + * equal to kf_max_dist for a fixed interval. */ - const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx); + unsigned int kf_max_dist; +} vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ + +/*!\brief Initialize an encoder instance + * + * Initializes a encoder context using the given interface. Applications + * should call the vpx_codec_enc_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with --disable-multithread, this call + * is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * VPX_ENCODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ +vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + const vpx_codec_enc_cfg_t *cfg, + vpx_codec_flags_t flags, int ver); + +/*!\brief Convenience macro for vpx_codec_enc_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_enc_init(ctx, iface, cfg, flags) \ + vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION) + +/*!\brief Initialize multi-encoder instance + * + * Initializes multi-encoder context using the given interface. + * Applications should call the vpx_codec_enc_init_multi convenience macro + * instead of this function directly, to ensure that the ABI version number + * parameter is properly initialized. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] num_enc Total number of encoders. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] dsf Pointer to down-sampling factors. + * \param[in] ver ABI version number. Must be set to + * VPX_ENCODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. 
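The configure-then-initialize flow documented above for vpx_codec_enc_init_ver() / vpx_codec_enc_init() looks roughly like the sketch below. It is only an illustration: vpx_codec_vp8_cx() comes from vpx/vp8cx.h, the g_w/g_h/g_timebase fields and VPX_KF_AUTO are declared in parts of vpx_encoder.h not shown in this hunk, and the numeric values are arbitrary.

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch: populate defaults, specialize a few fields, then initialize. */
static int open_vp8_encoder(vpx_codec_ctx_t *codec) {
  vpx_codec_enc_cfg_t cfg;
  if (vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0) != VPX_CODEC_OK)
    return -1;
  cfg.g_w = 640;               /* illustrative frame size */
  cfg.g_h = 480;
  cfg.g_timebase.num = 1;      /* 30 fps timebase */
  cfg.g_timebase.den = 30;
  cfg.rc_target_bitrate = 400; /* kilobits per second, per the field above */
  cfg.kf_mode = VPX_KF_AUTO;
  cfg.kf_max_dist = 150;       /* keyframe at least every 150 frames */
  /* The convenience macro fills in VPX_ENCODER_ABI_VERSION. */
  return vpx_codec_enc_init(codec, vpx_codec_vp8_cx(), &cfg, 0) == VPX_CODEC_OK
             ? 0
             : -1;
}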
+ */ +vpx_codec_err_t vpx_codec_enc_init_multi_ver( + vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, + int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver); + +/*!\brief Convenience macro for vpx_codec_enc_init_multi_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \ + vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \ + VPX_ENCODER_ABI_VERSION) + +/*!\brief Get a default configuration + * + * Initializes a encoder configuration structure with default values. Supports + * the notion of "usages" so that an algorithm may offer different default + * settings depending on the user's intended goal. This function \ref SHOULD + * be called by all applications to initialize the configuration structure + * before specializing the configuration with application specific values. + * + * \param[in] iface Pointer to the algorithm interface to use. + * \param[out] cfg Configuration buffer to populate. + * \param[in] reserved Must set to 0 for VP8 and VP9. + * + * \retval #VPX_CODEC_OK + * The configuration was populated. + * \retval #VPX_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + unsigned int reserved); + +/*!\brief Set or change configuration + * + * Reconfigures an encoder instance according to the given configuration. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cfg Configuration buffer to use + * + * \retval #VPX_CODEC_OK + * The configuration was populated. + * \retval #VPX_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, + const vpx_codec_enc_cfg_t *cfg); + +/*!\brief Get global stream headers + * + * Retrieves a stream level global header packet, if supported by the codec. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval NULL + * Encoder does not support global header + * \retval Non-NULL + * Pointer to buffer containing global header packet + */ +vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); + +/*!\brief deadline parameter analogous to VPx REALTIME mode. */ +#define VPX_DL_REALTIME (1) +/*!\brief deadline parameter analogous to VPx GOOD QUALITY mode. */ +#define VPX_DL_GOOD_QUALITY (1000000) +/*!\brief deadline parameter analogous to VPx BEST QUALITY mode. */ +#define VPX_DL_BEST_QUALITY (0) +/*!\brief Encode a frame + * + * Encodes a video frame at the given "presentation time." The presentation + * time stamp (PTS) \ref MUST be strictly increasing. + * + * The encoder supports the notion of a soft real-time deadline. Given a + * non-zero value to the deadline parameter, the encoder will make a "best + * effort" guarantee to return before the given time slice expires. It is + * implicit that limiting the available time to encode will degrade the + * output quality. The encoder can be given an unlimited time to produce the + * best possible frame by specifying a deadline of '0'. This deadline + * supercedes the VPx notion of "best quality, good quality, realtime". 
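Once a stream is running, the vpx_codec_enc_config_set() call documented above can apply an updated configuration to the live encoder. A minimal sketch, assuming codec and cfg are the context and configuration from the initialization sketch earlier:

/* Sketch: halve the target bitrate of an already-initialized encoder. */
static vpx_codec_err_t halve_bitrate(vpx_codec_ctx_t *codec,
                                     vpx_codec_enc_cfg_t *cfg) {
  cfg->rc_target_bitrate /= 2; /* still expressed in kilobits per second */
  return vpx_codec_enc_config_set(codec, cfg);
}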
+ * Applications that wish to map these former settings to the new deadline + * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY, + * and #VPX_DL_BEST_QUALITY. + * + * When the last frame has been passed to the encoder, this function should + * continue to be called, with the img parameter set to NULL. This will + * signal the end-of-stream condition to the encoder and allow it to encode + * any held buffers. Encoding is complete when vpx_codec_encode() is called + * and vpx_codec_get_cx_data() returns no data. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] img Image data to encode, NULL to flush. + * \param[in] pts Presentation time stamp, in timebase units. + * \param[in] duration Duration to show frame, in timebase units. + * \param[in] flags Flags to use for encoding this frame. + * \param[in] deadline Time to spend encoding, in microseconds. (0=infinite) + * + * \retval #VPX_CODEC_OK + * The configuration was populated. + * \retval #VPX_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. + */ +vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned long duration, + vpx_enc_frame_flags_t flags, + unsigned long deadline); + +/*!\brief Set compressed data output buffer + * + * Sets the buffer that the codec should output the compressed data + * into. This call effectively sets the buffer pointer returned in the + * next VPX_CODEC_CX_FRAME_PKT packet. Subsequent packets will be + * appended into this buffer. The buffer is preserved across frames, + * so applications must periodically call this function after flushing + * the accumulated compressed data to disk or to the network to reset + * the pointer to the buffer's head. + * + * `pad_before` bytes will be skipped before writing the compressed + * data, and `pad_after` bytes will be appended to the packet. The size + * of the packet will be the sum of the size of the actual compressed + * data, pad_before, and pad_after. The padding bytes will be preserved + * (not overwritten). + * + * Note that calling this function does not guarantee that the returned + * compressed data will be placed into the specified buffer. In the + * event that the encoded data will not fit into the buffer provided, + * the returned packet \ref MAY point to an internal buffer, as it would + * if this call were never used. In this event, the output packet will + * NOT have any padding, and the application must free space and copy it + * to the proper place. This is of particular note in configurations + * that may output multiple packets for a single encoded frame (e.g., lagged + * encoding) or if the application does not reset the buffer periodically. + * + * Applications may restore the default behavior of the codec providing + * the compressed data buffer by calling this function with a NULL + * buffer. + * + * Applications \ref MUSTNOT call this function during iteration of + * vpx_codec_get_cx_data(). + * + * \param[in] ctx Pointer to this instance's context + * \param[in] buf Buffer to store compressed data into + * \param[in] pad_before Bytes to skip before writing compressed data + * \param[in] pad_after Bytes to skip after writing compressed data + * + * \retval #VPX_CODEC_OK + * The buffer was set successfully. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. 
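The encode/flush pattern described above, combined with the packet iteration documented for vpx_codec_get_cx_data() further below, typically looks like this sketch. write_packet is a hypothetical application callback, error handling is omitted, and the one-timebase-unit duration is only an example.

/* Sketch: encode one frame (or flush with img == NULL) and drain packets. */
static void encode_frame(vpx_codec_ctx_t *codec, const vpx_image_t *img,
                         vpx_codec_pts_t pts,
                         void (*write_packet)(const vpx_codec_cx_pkt_t *)) {
  vpx_codec_iter_t iter = NULL;
  const vpx_codec_cx_pkt_t *pkt;

  vpx_codec_encode(codec, img, pts, 1 /* duration */, 0 /* flags */,
                   VPX_DL_GOOD_QUALITY);

  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
      write_packet(pkt); /* hand compressed frames to the muxer */
    /* Other packet kinds (e.g. VPX_CODEC_STATS_PKT) are ignored here. */
  }
}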
+ */ +vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, + const vpx_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after); +/*!\brief Encoded data iterator + * + * Iterates over a list of data packets to be passed from the encoder to the + * application. The different kinds of packets available are enumerated in + * #vpx_codec_cx_pkt_kind. + * + * #VPX_CODEC_CX_FRAME_PKT packets should be passed to the application's + * muxer. Multiple compressed frames may be in the list. + * #VPX_CODEC_STATS_PKT packets should be appended to a global buffer. + * + * The application \ref MUST silently ignore any packet kinds that it does + * not recognize or support. + * + * The data buffers returned from this function are only guaranteed to be + * valid until the application makes another call to any vpx_codec_* function. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an output data packet (compressed frame data, + * two-pass statistics, etc.) or NULL to signal end-of-list. + * + */ +const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, + vpx_codec_iter_t *iter); + +/*!\brief Get Preview Frame + * + * Returns an image that can be used as a preview. Shows the image as it would + * exist at the decompressor. The application \ref MUST NOT write into this + * image buffer. + * + * \param[in] ctx Pointer to this instance's context + * + * \return Returns a pointer to a preview image, or NULL if no image is + * available. + * + */ +const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx); - /*!@} - end defgroup encoder*/ +/*!@} - end defgroup encoder*/ #ifdef __cplusplus } #endif #endif // VPX_VPX_ENCODER_H_ - diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h index 109aec445c1ec8d8e8596312021031762cf071b5..8adbe25aad815a51ace428e6b38b1325ffdeba3d 100644 --- a/vpx/vpx_frame_buffer.h +++ b/vpx/vpx_frame_buffer.h @@ -37,9 +37,9 @@ extern "C" { * This structure holds allocated frame buffers used by the decoder. 
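The vpx_codec_frame_buffer_t structure and the get/release callback prototypes in the vpx_frame_buffer.h hunk beginning here are normally implemented by the application as a pair. The sketch below uses plain malloc/free; a real client would usually pool buffers, and registration happens through vpx_codec_set_frame_buffer_functions() in vpx/vpx_decoder.h, which is outside this patch.

#include <stdlib.h>
#include <string.h>
#include "vpx/vpx_frame_buffer.h"

/* Sketch: satisfy a request for at least min_size bytes. Returning a value
 * less than zero reports failure to the decoder. */
static int example_get_frame_buffer(void *priv, size_t min_size,
                                    vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)malloc(min_size);
  if (!fb->data) return -1;
  memset(fb->data, 0, min_size);
  fb->size = min_size;
  fb->priv = fb->data; /* whatever the release callback needs to find it */
  return 0;
}

static int example_release_frame_buffer(void *priv,
                                        vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->priv);
  fb->data = NULL;
  return 0;
}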
*/ typedef struct vpx_codec_frame_buffer { - uint8_t *data; /**< Pointer to the data buffer */ - size_t size; /**< Size of data in bytes */ - void *priv; /**< Frame's private data */ + uint8_t *data; /**< Pointer to the data buffer */ + size_t size; /**< Size of data in bytes */ + void *priv; /**< Frame's private data */ } vpx_codec_frame_buffer_t; /*!\brief get frame buffer callback prototype @@ -60,8 +60,8 @@ typedef struct vpx_codec_frame_buffer { * \param[in] new_size Size in bytes needed by the buffer * \param[in,out] fb Pointer to vpx_codec_frame_buffer_t */ -typedef int (*vpx_get_frame_buffer_cb_fn_t)( - void *priv, size_t min_size, vpx_codec_frame_buffer_t *fb); +typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); /*!\brief release frame buffer callback prototype * @@ -73,8 +73,8 @@ typedef int (*vpx_get_frame_buffer_cb_fn_t)( * \param[in] priv Callback's private data * \param[in] fb Pointer to vpx_codec_frame_buffer_t */ -typedef int (*vpx_release_frame_buffer_cb_fn_t)( - void *priv, vpx_codec_frame_buffer_t *fb); +typedef int (*vpx_release_frame_buffer_cb_fn_t)(void *priv, + vpx_codec_frame_buffer_t *fb); #ifdef __cplusplus } // extern "C" diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h index 7958c69806ed0fbefdcca9ce1b075a5eb734d815..d6d3166d2ffd08d900deddc69f9278bdf185b6d3 100644 --- a/vpx/vpx_image.h +++ b/vpx/vpx_image.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\file * \brief Describes the vpx image descriptor and associated operations * @@ -20,213 +19,203 @@ extern "C" { #endif - /*!\brief Current ABI version number - * - * \internal - * If this file is altered in any way that changes the ABI, this value - * must be bumped. Examples include, but are not limited to, changing - * types, removing or reassigning enums, adding/removing/rearranging - * fields to structures - */ +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ #define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/ - -#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ -#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ -#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */ -#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. 
*/ - - /*!\brief List of supported image formats */ - typedef enum vpx_img_fmt { - VPX_IMG_FMT_NONE, - VPX_IMG_FMT_RGB24, /**< 24 bit per pixel packed RGB */ - VPX_IMG_FMT_RGB32, /**< 32 bit per pixel packed 0RGB */ - VPX_IMG_FMT_RGB565, /**< 16 bit per pixel, 565 */ - VPX_IMG_FMT_RGB555, /**< 16 bit per pixel, 555 */ - VPX_IMG_FMT_UYVY, /**< UYVY packed YUV */ - VPX_IMG_FMT_YUY2, /**< YUYV packed YUV */ - VPX_IMG_FMT_YVYU, /**< YVYU packed YUV */ - VPX_IMG_FMT_BGR24, /**< 24 bit per pixel packed BGR */ - VPX_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */ - VPX_IMG_FMT_ARGB, /**< 32 bit packed ARGB, alpha=255 */ - VPX_IMG_FMT_ARGB_LE, /**< 32 bit packed BGRA, alpha=255 */ - VPX_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */ - VPX_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */ - VPX_IMG_FMT_YV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ - VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2, - VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */ - VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, - VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, - VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, - VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7, - VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6, - VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, - VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, - VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH, - VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH - } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ - - /*!\brief List of supported color spaces */ - typedef enum vpx_color_space { - VPX_CS_UNKNOWN = 0, /**< Unknown */ - VPX_CS_BT_601 = 1, /**< BT.601 */ - VPX_CS_BT_709 = 2, /**< BT.709 */ - VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */ - VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */ - VPX_CS_BT_2020 = 5, /**< BT.2020 */ - VPX_CS_RESERVED = 6, /**< Reserved */ - VPX_CS_SRGB = 7 /**< sRGB */ - } vpx_color_space_t; /**< alias for enum vpx_color_space */ - - /*!\brief List of supported color range */ - typedef enum vpx_color_range { - VPX_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */ - VPX_CR_FULL_RANGE = 1 /**< YUV/RGB [0..255] */ - } vpx_color_range_t; /**< alias for enum vpx_color_range */ - - /**\brief Image Descriptor */ - typedef struct vpx_image { - vpx_img_fmt_t fmt; /**< Image Format */ - vpx_color_space_t cs; /**< Color Space */ - vpx_color_range_t range; /**< Color Range */ - - /* Image storage dimensions */ - unsigned int w; /**< Stored image width */ - unsigned int h; /**< Stored image height */ - unsigned int bit_depth; /**< Stored image bit-depth */ - - /* Image display dimensions */ - unsigned int d_w; /**< Displayed image width */ - unsigned int d_h; /**< Displayed image height */ - - /* Image intended rendering dimensions */ - unsigned int r_w; /**< Intended rendering image width */ - unsigned int r_h; /**< Intended rendering image height */ - - /* Chroma subsampling info */ - unsigned int x_chroma_shift; /**< subsampling order, X */ - unsigned int y_chroma_shift; /**< subsampling order, Y */ - - /* Image data pointers. 
*/ -#define VPX_PLANE_PACKED 0 /**< To be used for all packed formats */ -#define VPX_PLANE_Y 0 /**< Y (Luminance) plane */ -#define VPX_PLANE_U 1 /**< U (Chroma) plane */ -#define VPX_PLANE_V 2 /**< V (Chroma) plane */ -#define VPX_PLANE_ALPHA 3 /**< A (Transparency) plane */ - unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */ - int stride[4]; /**< stride between rows for each plane */ - - int bps; /**< bits per sample (for packed formats) */ - - /* The following member may be set by the application to associate data - * with this image. - */ - void *user_priv; /**< may be set by the application to associate data - * with this image. */ - - /* The following members should be treated as private. */ - unsigned char *img_data; /**< private */ - int img_data_owner; /**< private */ - int self_allocd; /**< private */ - - void *fb_priv; /**< Frame buffer data associated with the image. */ - } vpx_image_t; /**< alias for struct vpx_image */ - - /**\brief Representation of a rectangle on a surface */ - typedef struct vpx_image_rect { - unsigned int x; /**< leftmost column */ - unsigned int y; /**< topmost row */ - unsigned int w; /**< width */ - unsigned int h; /**< height */ - } vpx_image_rect_t; /**< alias for struct vpx_image_rect */ - - /*!\brief Open a descriptor, allocating storage for the underlying image - * - * Returns a descriptor for storing an image of the given format. The - * storage for the descriptor is allocated on the heap. - * - * \param[in] img Pointer to storage for descriptor. If this parameter - * is NULL, the storage for the descriptor will be - * allocated on the heap. - * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image(stride). - * - * \return Returns a pointer to the initialized image descriptor. If the img - * parameter is non-null, the value of the img parameter will be - * returned. +#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ +#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ +#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */ +#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. 
*/ + +/*!\brief List of supported image formats */ +typedef enum vpx_img_fmt { + VPX_IMG_FMT_NONE, + VPX_IMG_FMT_RGB24, /**< 24 bit per pixel packed RGB */ + VPX_IMG_FMT_RGB32, /**< 32 bit per pixel packed 0RGB */ + VPX_IMG_FMT_RGB565, /**< 16 bit per pixel, 565 */ + VPX_IMG_FMT_RGB555, /**< 16 bit per pixel, 555 */ + VPX_IMG_FMT_UYVY, /**< UYVY packed YUV */ + VPX_IMG_FMT_YUY2, /**< YUYV packed YUV */ + VPX_IMG_FMT_YVYU, /**< YVYU packed YUV */ + VPX_IMG_FMT_BGR24, /**< 24 bit per pixel packed BGR */ + VPX_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */ + VPX_IMG_FMT_ARGB, /**< 32 bit packed ARGB, alpha=255 */ + VPX_IMG_FMT_ARGB_LE, /**< 32 bit packed BGRA, alpha=255 */ + VPX_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */ + VPX_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */ + VPX_IMG_FMT_YV12 = + VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ + VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2, + VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | + 3, /** < planar 4:2:0 format with vpx color space */ + VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, + VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, + VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, + VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7, + VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6, + VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH +} vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ + +/*!\brief List of supported color spaces */ +typedef enum vpx_color_space { + VPX_CS_UNKNOWN = 0, /**< Unknown */ + VPX_CS_BT_601 = 1, /**< BT.601 */ + VPX_CS_BT_709 = 2, /**< BT.709 */ + VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */ + VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */ + VPX_CS_BT_2020 = 5, /**< BT.2020 */ + VPX_CS_RESERVED = 6, /**< Reserved */ + VPX_CS_SRGB = 7 /**< sRGB */ +} vpx_color_space_t; /**< alias for enum vpx_color_space */ + +/*!\brief List of supported color range */ +typedef enum vpx_color_range { + VPX_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */ + VPX_CR_FULL_RANGE = 1 /**< YUV/RGB [0..255] */ +} vpx_color_range_t; /**< alias for enum vpx_color_range */ + +/**\brief Image Descriptor */ +typedef struct vpx_image { + vpx_img_fmt_t fmt; /**< Image Format */ + vpx_color_space_t cs; /**< Color Space */ + vpx_color_range_t range; /**< Color Range */ + + /* Image storage dimensions */ + unsigned int w; /**< Stored image width */ + unsigned int h; /**< Stored image height */ + unsigned int bit_depth; /**< Stored image bit-depth */ + + /* Image display dimensions */ + unsigned int d_w; /**< Displayed image width */ + unsigned int d_h; /**< Displayed image height */ + + /* Image intended rendering dimensions */ + unsigned int r_w; /**< Intended rendering image width */ + unsigned int r_h; /**< Intended rendering image height */ + + /* Chroma subsampling info */ + unsigned int x_chroma_shift; /**< subsampling order, X */ + unsigned int y_chroma_shift; /**< subsampling order, Y */ + +/* Image data pointers. 
*/ +#define VPX_PLANE_PACKED 0 /**< To be used for all packed formats */ +#define VPX_PLANE_Y 0 /**< Y (Luminance) plane */ +#define VPX_PLANE_U 1 /**< U (Chroma) plane */ +#define VPX_PLANE_V 2 /**< V (Chroma) plane */ +#define VPX_PLANE_ALPHA 3 /**< A (Transparency) plane */ + unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */ + int stride[4]; /**< stride between rows for each plane */ + + int bps; /**< bits per sample (for packed formats) */ + + /*!\brief The following member may be set by the application to associate + * data with this image. */ - vpx_image_t *vpx_img_alloc(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, - unsigned int align); - - /*!\brief Open a descriptor, using existing storage for the underlying image - * - * Returns a descriptor for storing an image of the given format. The - * storage for descriptor has been allocated elsewhere, and a descriptor is - * desired to "wrap" that storage. - * - * \param[in] img Pointer to storage for descriptor. If this parameter - * is NULL, the storage for the descriptor will be - * allocated on the heap. - * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of each row in the image. - * \param[in] img_data Storage to use for the image - * - * \return Returns a pointer to the initialized image descriptor. If the img - * parameter is non-null, the value of the img parameter will be - * returned. - */ - vpx_image_t *vpx_img_wrap(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, - unsigned int align, - unsigned char *img_data); - - - /*!\brief Set the rectangle identifying the displayed portion of the image - * - * Updates the displayed rectangle (aka viewport) on the image surface to - * match the specified coordinates and size. - * - * \param[in] img Image descriptor - * \param[in] x leftmost column - * \param[in] y topmost row - * \param[in] w width - * \param[in] h height - * - * \return 0 if the requested rectangle is valid, nonzero otherwise. - */ - int vpx_img_set_rect(vpx_image_t *img, - unsigned int x, - unsigned int y, - unsigned int w, - unsigned int h); - - - /*!\brief Flip the image vertically (top for bottom) - * - * Adjusts the image descriptor's pointers and strides to make the image - * be referenced upside-down. - * - * \param[in] img Image descriptor - */ - void vpx_img_flip(vpx_image_t *img); + void *user_priv; - /*!\brief Close an image descriptor - * - * Frees all allocated storage associated with an image descriptor. - * - * \param[in] img Image descriptor - */ - void vpx_img_free(vpx_image_t *img); + /* The following members should be treated as private. */ + unsigned char *img_data; /**< private */ + int img_data_owner; /**< private */ + int self_allocd; /**< private */ + + void *fb_priv; /**< Frame buffer data associated with the image. */ +} vpx_image_t; /**< alias for struct vpx_image */ + +/**\brief Representation of a rectangle on a surface */ +typedef struct vpx_image_rect { + unsigned int x; /**< leftmost column */ + unsigned int y; /**< topmost row */ + unsigned int w; /**< width */ + unsigned int h; /**< height */ +} vpx_image_rect_t; /**< alias for struct vpx_image_rect */ + +/*!\brief Open a descriptor, allocating storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for the descriptor is allocated on the heap. 
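The planes[]/stride[] layout documented above, together with vpx_img_alloc() and vpx_img_free() declared later in this hunk, is typically used as in the sketch below. The 32-byte alignment and the fill values are illustrative choices, and make_blank_i420 is not a library function.

#include <string.h>
#include "vpx/vpx_image.h"

/* Sketch: allocate an I420 image and clear each plane row by row. Chroma
 * planes are subsampled according to x_chroma_shift / y_chroma_shift. */
static vpx_image_t *make_blank_i420(unsigned int w, unsigned int h) {
  vpx_image_t *img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, w, h, 32);
  int plane;
  if (!img) return NULL;
  for (plane = VPX_PLANE_Y; plane <= VPX_PLANE_V; ++plane) {
    unsigned char *row = img->planes[plane];
    const unsigned int plane_w = plane ? (w + 1) >> img->x_chroma_shift : w;
    const unsigned int plane_h = plane ? (h + 1) >> img->y_chroma_shift : h;
    unsigned int y;
    for (y = 0; y < plane_h; ++y) {
      memset(row, plane ? 128 : 0, plane_w); /* luma 0, chroma neutral */
      row += img->stride[plane];
    }
  }
  return img; /* release with vpx_img_free() when done */
}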
+ * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image(stride). + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align); + +/*!\brief Open a descriptor, using existing storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for descriptor has been allocated elsewhere, and a descriptor is + * desired to "wrap" that storage. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of each row in the image. + * \param[in] img_data Storage to use for the image + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int align, + unsigned char *img_data); + +/*!\brief Set the rectangle identifying the displayed portion of the image + * + * Updates the displayed rectangle (aka viewport) on the image surface to + * match the specified coordinates and size. + * + * \param[in] img Image descriptor + * \param[in] x leftmost column + * \param[in] y topmost row + * \param[in] w width + * \param[in] h height + * + * \return 0 if the requested rectangle is valid, nonzero otherwise. + */ +int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h); + +/*!\brief Flip the image vertically (top for bottom) + * + * Adjusts the image descriptor's pointers and strides to make the image + * be referenced upside-down. + * + * \param[in] img Image descriptor + */ +void vpx_img_flip(vpx_image_t *img); + +/*!\brief Close an image descriptor + * + * Frees all allocated storage associated with an image descriptor. + * + * \param[in] img Image descriptor + */ +void vpx_img_free(vpx_image_t *img); #ifdef __cplusplus } // extern "C" diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h index 2945c87ca45c73f64824880d3251284ad4b3c684..09bad9222d4356df00036475267705d83659f4d5 100644 --- a/vpx/vpx_integer.h +++ b/vpx/vpx_integer.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #ifndef VPX_VPX_INTEGER_H_ #define VPX_VPX_INTEGER_H_ @@ -25,13 +24,13 @@ #endif #if defined(VPX_EMULATE_INTTYPES) -typedef signed char int8_t; +typedef signed char int8_t; typedef signed short int16_t; -typedef signed int int32_t; +typedef signed int int32_t; -typedef unsigned char uint8_t; +typedef unsigned char uint8_t; typedef unsigned short uint16_t; -typedef unsigned int uint32_t; +typedef unsigned int uint32_t; #ifndef _UINTPTR_T_DEFINED typedef size_t uintptr_t; @@ -42,12 +41,12 @@ typedef size_t uintptr_t; /* Most platforms have the C99 standard integer types. 
*/ #if defined(__cplusplus) -# if !defined(__STDC_FORMAT_MACROS) -# define __STDC_FORMAT_MACROS -# endif -# if !defined(__STDC_LIMIT_MACROS) -# define __STDC_LIMIT_MACROS -# endif +#if !defined(__STDC_FORMAT_MACROS) +#define __STDC_FORMAT_MACROS +#endif +#if !defined(__STDC_LIMIT_MACROS) +#define __STDC_LIMIT_MACROS +#endif #endif // __cplusplus #include <stdint.h> diff --git a/vpx_dsp/add_noise.c b/vpx_dsp/add_noise.c index 4ae67a813ec8a33abe642a2bfa590aa81f9922fa..80b1af9dde8d7515c71fdc2c95a5859b6e56f05f 100644 --- a/vpx_dsp/add_noise.c +++ b/vpx_dsp/add_noise.c @@ -17,16 +17,14 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -void vpx_plane_add_noise_c(uint8_t *start, char *noise, - char blackclamp[16], - char whiteclamp[16], - char bothclamp[16], +void vpx_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], + char whiteclamp[16], char bothclamp[16], unsigned int width, unsigned int height, int pitch) { unsigned int i, j; for (i = 0; i < height; ++i) { uint8_t *pos = start + i * pitch; - char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT + char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT for (j = 0; j < width; ++j) { int v = pos[j]; @@ -51,7 +49,7 @@ int vpx_setup_noise(double sigma, int size, char *noise) { // set up a 256 entry lookup that matches gaussian distribution for (i = -32; i < 32; ++i) { - const int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i)); + const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i)); if (a_i) { for (j = 0; j < a_i; ++j) { char_dist[next + j] = (char)i; diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index e52958c547fa16b07c0847ddff31bec9ad8d6e83..001517d33ee71a8095b515fedb28f453d8971368 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -198,27 +198,24 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { } } -void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int *min, int *max) { +void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min, int *max) { // Load and concatenate. 
- const uint8x16_t a01 = vcombine_u8(vld1_u8(a), - vld1_u8(a + a_stride)); - const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride), - vld1_u8(a + 3 * a_stride)); - const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride), - vld1_u8(a + 5 * a_stride)); - const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride), - vld1_u8(a + 7 * a_stride)); - - const uint8x16_t b01 = vcombine_u8(vld1_u8(b), - vld1_u8(b + b_stride)); - const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride), - vld1_u8(b + 3 * b_stride)); - const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride), - vld1_u8(b + 5 * b_stride)); - const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride), - vld1_u8(b + 7 * b_stride)); + const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride)); + const uint8x16_t a23 = + vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride)); + const uint8x16_t a45 = + vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride)); + const uint8x16_t a67 = + vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride)); + + const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride)); + const uint8x16_t b23 = + vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride)); + const uint8x16_t b45 = + vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride)); + const uint8x16_t b67 = + vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride)); // Absolute difference. const uint8x16_t ab01_diff = vabdq_u8(a01, b01); diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c index 9f9de98d90eb65093d5c0617b51eadbcce6d765b..7cb2ba90d2fe6cd63ecac35bc432fac58b9e35ba 100644 --- a/vpx_dsp/arm/fwd_txfm_neon.c +++ b/vpx_dsp/arm/fwd_txfm_neon.c @@ -131,14 +131,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { // 14 15 16 17 54 55 56 57 // 24 25 26 27 64 65 66 67 // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), - vreinterpretq_s32_s16(out_2)); - const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), - vreinterpretq_s32_s16(out_3)); - const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4), - vreinterpretq_s32_s16(out_6)); - const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), - vreinterpretq_s32_s16(out_7)); + const int32x4x2_t r02_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); const int16x8x2_t r01_s16 = vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), vreinterpretq_s16_s32(r13_s32.val[0])); diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index 21e3e3dbacfb79ae044954081a4f3138d5514b98..46b2755ea68ed83266cfa591543587b43396dd64 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -12,9 +12,8 @@ #include "./vpx_dsp_rtcd.h" -static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, - int16x8_t *a2, int16x8_t *a3, - int16x8_t *a4, int16x8_t *a5, +static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) { const int16x8_t b0 = vaddq_s16(*a0, *a1); const int16x8_t b1 = vsubq_s16(*a0, *a1); @@ -47,9 +46,8 @@ static void 
hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, // TODO(johannkoenig): Make a transpose library and dedup with idct. Consider // reversing transpose order which may make it easier for the compiler to // reconcile the vtrn.64 moves. -static void transpose8x8(int16x8_t *a0, int16x8_t *a1, - int16x8_t *a2, int16x8_t *a3, - int16x8_t *a4, int16x8_t *a5, +static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) { // Swap 64 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 @@ -91,14 +89,14 @@ static void transpose8x8(int16x8_t *a0, int16x8_t *a1, // a1657_hi: // 12 13 28 29 44 45 60 61 // 14 15 30 31 46 47 62 63 - const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo), - vreinterpretq_s32_s16(a26_lo)); - const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo), - vreinterpretq_s32_s16(a37_lo)); - const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi), - vreinterpretq_s32_s16(a26_hi)); - const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi), - vreinterpretq_s32_s16(a37_hi)); + const int32x4x2_t a0246_lo = + vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo)); + const int32x4x2_t a1357_lo = + vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo)); + const int32x4x2_t a0246_hi = + vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi)); + const int32x4x2_t a1357_hi = + vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi)); // Swap 16 bit elements resulting in: // b0: diff --git a/vpx_dsp/arm/idct16x16_1_add_neon.c b/vpx_dsp/arm/idct16x16_1_add_neon.c index f734e48027944b9630e39768c3258269fa1fb53c..466b408893e2bfc9d367e65658e5f6de3b59c55d 100644 --- a/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -13,49 +13,46 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_ports/mem.h" -void vpx_idct16x16_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } +void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int 
dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, j, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + for (d1 = d2 = dest, i = 0; i < 4; i++) { + for (j = 0; j < 2; j++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; } - return; + } + return; } diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c index 651ebb21f9967d4ac37380f403e7907ce8f3a2d9..6c03aff609b911e62c55b744ee8e1c9b1fdad24d 100644 --- a/vpx_dsp/arm/idct16x16_add_neon.c +++ b/vpx_dsp/arm/idct16x16_add_neon.c @@ -13,1175 +13,736 @@ #include "./vpx_config.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE8X8( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), - vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), - vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), - vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), - 
vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; +static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t *q14s16, int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; } -void vpx_idct16x16_256_add_neon_pass1( - int16_t *in, - int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - 
uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 = vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - d30s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16(cospi_24_64); - d31s16 = vdup_n_s16(cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); 
- q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); +void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, + int output_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = 
vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d18s16, d1s16); + q6s32 = vmull_s16(d19s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlal_s16(q5s32, d30s16, d0s16); + q6s32 = vmlal_s16(q6s32, d31s16, d0s16); + + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q5s32, 14); + d15s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + q2s32 = vmull_s16(d26s16, d2s16); + q3s32 = vmull_s16(d27s16, d2s16); + q9s32 = vmull_s16(d26s16, d3s16); + q15s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q15s32 = vmlal_s16(q15s32, d23s16, d2s16); + + d10s16 = vqrshrn_n_s32(q2s32, 14); + d11s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q15s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + d30s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d30s16); + q11s32 = vmull_s16(d17s16, d30s16); + q0s32 = vmull_s16(d24s16, d30s16); + q1s32 = vmull_s16(d25s16, d30s16); + + d30s16 = vdup_n_s16(cospi_24_64); + d31s16 = vdup_n_s16(cospi_8_64); + + q3s32 = vaddq_s32(q2s32, q0s32); + q12s32 = vaddq_s32(q11s32, q1s32); + q13s32 = vsubq_s32(q2s32, q0s32); + q1s32 = vsubq_s32(q11s32, q1s32); + + d16s16 = vqrshrn_n_s32(q3s32, 14); + d17s16 = vqrshrn_n_s32(q12s32, 14); + d18s16 = vqrshrn_n_s32(q13s32, 14); + d19s16 = vqrshrn_n_s32(q1s32, 14); + q8s16 = vcombine_s16(d16s16, d17s16); + q9s16 = vcombine_s16(d18s16, d19s16); + + q0s32 = vmull_s16(d20s16, d31s16); + q1s32 = vmull_s16(d21s16, d31s16); + q12s32 = vmull_s16(d20s16, d30s16); + q13s32 = vmull_s16(d21s16, d30s16); + + q0s32 = vmlal_s16(q0s32, d28s16, d30s16); + q1s32 = vmlal_s16(q1s32, d29s16, d30s16); + q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); + + d22s16 = vqrshrn_n_s32(q0s32, 14); + d23s16 = vqrshrn_n_s32(q1s32, 14); + d20s16 = vqrshrn_n_s32(q12s32, 14); + d21s16 = vqrshrn_n_s32(q13s32, 14); + q10s16 = vcombine_s16(d20s16, d21s16); + q11s16 = vcombine_s16(d22s16, d23s16); + + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q15s16 = vaddq_s16(q6s16, q7s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + // stage 5 + q0s16 = vaddq_s16(q8s16, q11s16); + q1s16 = vaddq_s16(q9s16, q10s16); + q2s16 = vsubq_s16(q9s16, q10s16); + q3s16 = vsubq_s16(q8s16, q11s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q11s32 = vmull_s16(d26s16, d16s16); + q12s32 = vmull_s16(d27s16, d16s16); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + + q6s32 = vsubq_s32(q9s32, q11s32); + q13s32 = vsubq_s32(q10s32, q12s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d10s16 = vqrshrn_n_s32(q6s32, 14); + d11s16 = vqrshrn_n_s32(q13s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = 
vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q8s16 = vaddq_s16(q0s16, q15s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d16u64); + out += output_stride; + vst1_u64((uint64_t *)out, d17u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride) { + uint8_t *d; + uint8x8_t d12u8, d13u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d24u64, d25u64, d26u64, d27u64; + int64x1_t d12s64, d13s64; + uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; + uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 
16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d12s16 = vdup_n_s16(cospi_30_64); + d13s16 = vdup_n_s16(cospi_2_64); + + q2s32 = vmull_s16(d16s16, d12s16); + q3s32 = vmull_s16(d17s16, d12s16); + q1s32 = vmull_s16(d16s16, d13s16); + q4s32 = vmull_s16(d17s16, d13s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); + q1s32 = vmlal_s16(q1s32, d30s16, d12s16); + q4s32 = vmlal_s16(q4s32, d31s16, d12s16); + + d0s16 = vqrshrn_n_s32(q2s32, 14); + d1s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q1s32, 14); + d15s16 = vqrshrn_n_s32(q4s32, 14); + q0s16 = vcombine_s16(d0s16, d1s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d30s16 = vdup_n_s16(cospi_14_64); + d31s16 = vdup_n_s16(cospi_18_64); + + q2s32 = vmull_s16(d24s16, d30s16); + q3s32 = vmull_s16(d25s16, d30s16); + q4s32 = vmull_s16(d24s16, d31s16); + q5s32 = vmull_s16(d25s16, d31s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); + q4s32 = vmlal_s16(q4s32, d22s16, d30s16); + q5s32 = vmlal_s16(q5s32, d23s16, d30s16); + + d2s16 = vqrshrn_n_s32(q2s32, 14); + d3s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q4s32, 14); + d13s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(cospi_22_64); + d31s16 = vdup_n_s16(cospi_10_64); + + q11s32 = vmull_s16(d20s16, d30s16); + q12s32 = vmull_s16(d21s16, d30s16); + q4s32 = vmull_s16(d20s16, d31s16); + q5s32 = vmull_s16(d21s16, d31s16); + + q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); + q4s32 = vmlal_s16(q4s32, d26s16, d30s16); + q5s32 = vmlal_s16(q5s32, d27s16, d30s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d11s16 = vqrshrn_n_s32(q5s32, 14); + d10s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + d30s16 = vdup_n_s16(cospi_6_64); + d31s16 = vdup_n_s16(cospi_26_64); + + q10s32 = vmull_s16(d28s16, d30s16); + q11s32 = vmull_s16(d29s16, d30s16); + q12s32 = vmull_s16(d28s16, d31s16); + q13s32 = vmull_s16(d29s16, d31s16); + + q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); + q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); + q12s32 = vmlal_s16(q12s32, d18s16, d30s16); + q13s32 = vmlal_s16(q13s32, d19s16, d30s16); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q11s32, 14); + d8s16 = vqrshrn_n_s32(q12s32, 14); + d9s16 = vqrshrn_n_s32(q13s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 3 + q9s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q10s16 = vsubq_s16(q3s16, q2s16); + 
q11s16 = vaddq_s16(q2s16, q3s16); + q12s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q6s16, q7s16); + + // stage 4 + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q2s32 = vmull_s16(d18s16, d31s16); + q3s32 = vmull_s16(d19s16, d31s16); + q4s32 = vmull_s16(d28s16, d31s16); + q5s32 = vmull_s16(d29s16, d31s16); + + q2s32 = vmlal_s16(q2s32, d28s16, d30s16); + q3s32 = vmlal_s16(q3s32, d29s16, d30s16); + q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); + + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q3s32, 14); + d2s16 = vqrshrn_n_s32(q4s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + q3s16 = q11s16; + q4s16 = q12s16; + + d30s16 = vdup_n_s16(-cospi_8_64); + q11s32 = vmull_s16(d26s16, d30s16); + q12s32 = vmull_s16(d27s16, d30s16); + q8s32 = vmull_s16(d20s16, d30s16); + q9s32 = vmull_s16(d21s16, d30s16); + + q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); + q8s32 = vmlal_s16(q8s32, d26s16, d31s16); + q9s32 = vmlal_s16(q9s32, d27s16, d31s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q10s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q10s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + if (skip_adding != 0) { + d = dest; + // load the data in pass1 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + 
pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q14s16 = vsubq_s16(q1s16, q14s16); q15s16 = vsubq_s16(q0s16, q15s16); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q4s16 = vsubq_s16(q11s16, q4s16); + 
q5s16 = vsubq_s16(q10s16, q5s16); -void vpx_idct16x16_256_add_neon_pass2( - int16_t *src, - int16_t *out, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d12s16 = vdup_n_s16(cospi_30_64); - d13s16 = vdup_n_s16(cospi_2_64); - - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); - - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d30s16 = vdup_n_s16(cospi_14_64); - d31s16 = vdup_n_s16(cospi_18_64); - - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(cospi_22_64); - d31s16 = vdup_n_s16(cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, 
d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16(cospi_6_64); - d31s16 = vdup_n_s16(cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); - - // stage 4 - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = 
vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = 
vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, 
vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t 
*)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); - } - return; -} + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); -void vpx_idct16x16_10_add_neon_pass1( - int16_t *in, - int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // stage 3 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - // stage 4 - q1s16 = vdupq_n_s16(cospi_16_64 * 2); - d4s16 = vdup_n_s16(cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = 
vcombine_s16(d12s16, d13s16); - - // stage 6 - q2s16 = vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); -void vpx_idct16x16_10_add_neon_pass2( - int16_t *src, - int16_t *out, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, 
d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // stage 3 - q6s16 = vdupq_n_s16(cospi_30_64 * 2); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16(cospi_2_64 * 2); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16(cospi_6_64 * 2); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); - - // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = 
vdup_n_s16(cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 + // store the data out 8,9,10,11,12,13,14,15 + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q8s16 = vrshrq_n_s16(q8s16, 6); + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q9s16 = vrshrq_n_s16(q9s16, 6); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q2s16 = vrshrq_n_s16(q2s16, 6); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q3s16 = vrshrq_n_s16(q3s16, 6); + q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q4s16 = vrshrq_n_s16(q4s16, 6); + q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q5s16 = vrshrq_n_s16(q5s16, 6); + q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q14s16 = vrshrq_n_s16(q14s16, 6); + q14u16 = + vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + q15s16 = vrshrq_n_s16(q15s16, 6); + q15u16 = + vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + } else { // skip_adding_dest 
q0s16 = vld1q_s16(pass1Output); pass1Output += 8; q1s16 = vld1q_s16(pass1Output); @@ -1248,6 +809,7 @@ void vpx_idct16x16_10_add_neon_pass2( q10s16 = vld1q_s16(pass1Output); pass1Output += 8; q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; q12s16 = vaddq_s16(q10s16, q9s16); q13s16 = vaddq_s16(q11s16, q8s16); d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); @@ -1265,53 +827,468 @@ void vpx_idct16x16_10_add_neon_pass2( q8s16 = vsubq_s16(q11s16, q8s16); q9s16 = vsubq_s16(q10s16, q9s16); - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); out += 4; - vst1_u64((uint64_t *)out, d17u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); out += 12; - vst1_u64((uint64_t *)out, d18u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); out += 4; - vst1_u64((uint64_t *)out, d19u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); out += 12; - vst1_u64((uint64_t *)out, d4u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); out += 4; - vst1_u64((uint64_t *)out, d5u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); out += 12; - vst1_u64((uint64_t *)out, d6u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); out += 4; - vst1_u64((uint64_t *)out, d7u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); out += 12; - vst1_u64((uint64_t *)out, d8u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); out += 4; - vst1_u64((uint64_t *)out, d9u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); out += 12; - vst1_u64((uint64_t *)out, d10u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); out += 4; - vst1_u64((uint64_t *)out, d11u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); out += 12; - vst1_u64((uint64_t *)out, d28u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); out += 4; - vst1_u64((uint64_t *)out, d29u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); out += 12; - vst1_u64((uint64_t *)out, d30u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); out += 4; - vst1_u64((uint64_t *)out, d31u64); - return; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + } + return; +} + +void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, + int output_stride) { + int16x4_t d4s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, 
d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // stage 3 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + // stage 4 + q1s16 = vdupq_n_s16(cospi_16_64 * 2); + d4s16 = vdup_n_s16(cospi_16_64); + + q8s16 = vqrdmulhq_s16(q8s16, q1s16); + + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + q9s32 = vmull_s16(d14s16, d4s16); + q10s32 = vmull_s16(d15s16, d4s16); + q12s32 = vmull_s16(d9s16, d4s16); + q11s32 = vmull_s16(d8s16, d4s16); + + q15s32 = vsubq_s32(q10s32, q12s32); + q6s32 = vsubq_s32(q9s32, q11s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d11s16 = vqrshrn_n_s32(q15s32, 14); + d10s16 = vqrshrn_n_s32(q6s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q2s16 = vaddq_s16(q8s16, q7s16); + q9s16 = vaddq_s16(q8s16, q6s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q8s16, q4s16); + q12s16 = vsubq_s16(q8s16, q4s16); + q13s16 = vsubq_s16(q8s16, q5s16); + q14s16 = vsubq_s16(q8s16, q6s16); + q15s16 = vsubq_s16(q8s16, q7s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d4u64); + out += output_stride; + vst1_u64((uint64_t *)out, d5u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out 
+= output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; + uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; + uint64x1_t d16u64, d17u64, d18u64, d19u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + (void)skip_adding; + (void)dest; + (void)dest_stride; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // stage 3 + q6s16 = vdupq_n_s16(cospi_30_64 * 2); + q0s16 = vqrdmulhq_s16(q8s16, q6s16); + q6s16 = vdupq_n_s16(cospi_2_64 * 2); + q7s16 = vqrdmulhq_s16(q8s16, q6s16); + + q15s16 = vdupq_n_s16(-cospi_26_64 * 2); + q14s16 = vdupq_n_s16(cospi_6_64 * 2); + q3s16 = vqrdmulhq_s16(q9s16, q15s16); + q4s16 = vqrdmulhq_s16(q9s16, q14s16); + + // stage 4 + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d6s16 = vget_low_s16(q3s16); + d7s16 = vget_high_s16(q3s16); + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q12s32 = vmull_s16(d14s16, d31s16); + q5s32 = vmull_s16(d15s16, d31s16); + q2s32 = vmull_s16(d0s16, d31s16); + q11s32 = vmull_s16(d1s16, d31s16); + + q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); + q2s32 = vmlal_s16(q2s32, d14s16, d30s16); + q11s32 = vmlal_s16(q11s32, d15s16, d30s16); + + d2s16 = vqrshrn_n_s32(q12s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q11s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(-cospi_8_64); + q10s32 = vmull_s16(d8s16, d30s16); + q13s32 = vmull_s16(d9s16, d30s16); + q8s32 = vmull_s16(d6s16, d30s16); + q9s32 = vmull_s16(d7s16, d30s16); + + q10s32 = 
vmlsl_s16(q10s32, d6s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); + q8s32 = vmlal_s16(q8s32, d8s16, d31s16); + q9s32 = vmlal_s16(q9s32, d9s16, d31s16); + + d4s16 = vqrshrn_n_s32(q10s32, 14); + d5s16 = vqrshrn_n_s32(q13s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q0s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q0s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = 
vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); + d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); + d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); + d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); + d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); + d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + vst1_u64((uint64_t *)out, d16u64); + out += 4; + vst1_u64((uint64_t *)out, d17u64); + out += 12; + vst1_u64((uint64_t *)out, d18u64); + out += 4; + vst1_u64((uint64_t *)out, d19u64); + out += 12; + vst1_u64((uint64_t *)out, d4u64); + out += 4; + vst1_u64((uint64_t *)out, d5u64); + out += 12; + vst1_u64((uint64_t *)out, d6u64); + out += 4; + vst1_u64((uint64_t *)out, d7u64); + out += 12; + vst1_u64((uint64_t *)out, d8u64); + out += 4; + vst1_u64((uint64_t *)out, d9u64); + out += 12; + vst1_u64((uint64_t *)out, d10u64); + out += 4; + vst1_u64((uint64_t *)out, d11u64); + out += 12; + vst1_u64((uint64_t *)out, d28u64); + out += 4; + vst1_u64((uint64_t *)out, d29u64); + out += 12; + vst1_u64((uint64_t *)out, d30u64); + out += 4; + vst1_u64((uint64_t *)out, d31u64); + return; } diff --git a/vpx_dsp/arm/idct16x16_neon.c b/vpx_dsp/arm/idct16x16_neon.c index 352979aa16f7613c4a4d6c176095921be08a61a5..ecc263df28445be1e3d94b5d85126dd6c82062a5 100644 --- a/vpx_dsp/arm/idct16x16_neon.c +++ b/vpx_dsp/arm/idct16x16_neon.c @@ -10,24 +10,16 @@ #include "vpx_dsp/vpx_dsp_common.h" -void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, - int16_t *output, +void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, int output_stride); -void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); -void vpx_idct16x16_10_add_neon_pass1(const int16_t 
*input, - int16_t *output, +void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, int output_stride); -void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); +void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride); #if HAVE_NEON_ASM /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ @@ -35,13 +27,13 @@ extern void vpx_push_neon(int64_t *store); extern void vpx_pop_neon(int64_t *store); #endif // HAVE_NEON_ASM -void vpx_idct16x16_256_add_neon(const int16_t *input, - uint8_t *dest, int dest_stride) { +void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, + int dest_stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; #endif - int16_t pass1_output[16*16] = {0}; - int16_t row_idct_output[16*16] = {0}; + int16_t pass1_output[16 * 16] = { 0 }; + int16_t row_idct_output[16 * 16] = { 0 }; #if HAVE_NEON_ASM // save d8-d15 register values. @@ -56,27 +48,19 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input+1, - row_idct_output, - pass1_output, - 0, - dest, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, + dest, dest_stride); /* Parallel idct on the lower 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input+8*16+1, - row_idct_output+8, - pass1_output, - 0, - dest, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, + pass1_output, 0, dest, dest_stride); /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the @@ -86,27 +70,20 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, - row_idct_output, - pass1_output, - 1, - dest, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, + pass1_output, 1, dest, dest_stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. 
// Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, - row_idct_output+8, - pass1_output, - 1, - dest+8, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, + row_idct_output + 8, pass1_output, 1, + dest + 8, dest_stride); #if HAVE_NEON_ASM // restore d8-d15 register values. @@ -116,13 +93,13 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, return; } -void vpx_idct16x16_10_add_neon(const int16_t *input, - uint8_t *dest, int dest_stride) { +void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, + int dest_stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; #endif - int16_t pass1_output[16*16] = {0}; - int16_t row_idct_output[16*16] = {0}; + int16_t pass1_output[16 * 16] = { 0 }; + int16_t row_idct_output[16 * 16] = { 0 }; #if HAVE_NEON_ASM // save d8-d15 register values. @@ -137,12 +114,8 @@ void vpx_idct16x16_10_add_neon(const int16_t *input, // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_10_add_neon_pass2(input+1, - row_idct_output, - pass1_output, - 0, - dest, - dest_stride); + vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, + dest, dest_stride); /* Skip Parallel idct on the lower 8 rows as they are all 0s */ @@ -154,27 +127,20 @@ void vpx_idct16x16_10_add_neon(const int16_t *input, // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, - row_idct_output, - pass1_output, - 1, - dest, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, + pass1_output, 1, dest, dest_stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, - row_idct_output+8, - pass1_output, - 1, - dest+8, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, + row_idct_output + 8, pass1_output, 1, + dest + 8, dest_stride); #if HAVE_NEON_ASM // restore d8-d15 register values. 
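The next two files touch the DC-only ("_1_add") kernels, where only the DC coefficient is nonzero, so both transform passes reduce to a constant multiply and the result is added to every destination pixel with saturation. A minimal scalar sketch of that path for the 32x32 case, assuming Q14 cosine constants and local helper names (round_shift, clip_pixel) that are illustrative rather than the library's own definitions:

#include <stdint.h>

/* Round-to-nearest right shift, as used after Q14 constant multiplies. */
static int16_t round_shift(int32_t x, int bits) {
  return (int16_t)((x + (1 << (bits - 1))) >> bits);
}

/* Clamp an intermediate sum to the 8-bit pixel range. */
static uint8_t clip_pixel(int v) {
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Scalar reference for the DC-only 32x32 inverse transform + add. */
static void idct32x32_1_add_ref(const int16_t *input, uint8_t *dest,
                                int stride) {
  const int16_t cospi_16_64 = 11585;                      /* cos(pi/4) in Q14 */
  int16_t out = round_shift(input[0] * cospi_16_64, 14);  /* row pass */
  int a1, r, c;
  out = round_shift(out * cospi_16_64, 14);               /* column pass */
  a1 = (out + 32) >> 6;                                   /* final rounding shift */

  for (r = 0; r < 32; ++r, dest += stride)
    for (c = 0; c < 32; ++c) dest[c] = clip_pixel(dest[c] + a1);
}

The 4x4 variant later in this patch is the same computation with a final shift of 4 instead of 6. The NEON 32x32 kernel below clamps a1 (or -a1) to [0, 255] up front so the per-pixel update can use vqaddq_u8/vqsubq_u8 on 16 pixels at a time, while the 4x4 kernel instead widens with vaddw_u8 and narrows back with vqmovun_s16.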
diff --git a/vpx_dsp/arm/idct32x32_1_add_neon.c b/vpx_dsp/arm/idct32x32_1_add_neon.c index c25c0c4a5c208b4e758793e71fa6bdd40633e078..dab7d098e8abd82f30c1b4b6d43d69f7e3826f98 100644 --- a/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -15,151 +15,126 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_ports/mem.h" -static INLINE void LD_16x8( - uint8_t *d, - int d_stride, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); - return; +static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vld1q_u8(d); + d += d_stride; + *q9u8 = vld1q_u8(d); + d += d_stride; + *q10u8 = vld1q_u8(d); + d += d_stride; + *q11u8 = vld1q_u8(d); + d += d_stride; + *q12u8 = vld1q_u8(d); + d += d_stride; + *q13u8 = vld1q_u8(d); + d += d_stride; + *q14u8 = vld1q_u8(d); + d += d_stride; + *q15u8 = vld1q_u8(d); + return; } -static INLINE void ADD_DIFF_16x8( - uint8x16_t qdiffu8, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); - return; +static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqaddq_u8(*q8u8, qdiffu8); + *q9u8 = vqaddq_u8(*q9u8, qdiffu8); + *q10u8 = vqaddq_u8(*q10u8, qdiffu8); + *q11u8 = vqaddq_u8(*q11u8, qdiffu8); + *q12u8 = vqaddq_u8(*q12u8, qdiffu8); + *q13u8 = vqaddq_u8(*q13u8, qdiffu8); + *q14u8 = vqaddq_u8(*q14u8, qdiffu8); + *q15u8 = vqaddq_u8(*q15u8, qdiffu8); + return; } -static INLINE void SUB_DIFF_16x8( - uint8x16_t qdiffu8, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); - return; +static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqsubq_u8(*q8u8, qdiffu8); + *q9u8 = vqsubq_u8(*q9u8, qdiffu8); + *q10u8 = vqsubq_u8(*q10u8, qdiffu8); + *q11u8 = vqsubq_u8(*q11u8, qdiffu8); + *q12u8 = vqsubq_u8(*q12u8, qdiffu8); + *q13u8 = vqsubq_u8(*q13u8, qdiffu8); + *q14u8 = vqsubq_u8(*q14u8, qdiffu8); + *q15u8 = vqsubq_u8(*q15u8, 
qdiffu8); + return; } -static INLINE void ST_16x8( - uint8_t *d, - int d_stride, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); - return; +static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + vst1q_u8(d, *q8u8); + d += d_stride; + vst1q_u8(d, *q9u8); + d += d_stride; + vst1q_u8(d, *q10u8); + d += d_stride; + vst1q_u8(d, *q11u8); + d += d_stride; + vst1q_u8(d, *q12u8); + d += d_stride; + vst1q_u8(d, *q13u8); + d += d_stride; + vst1q_u8(d, *q14u8); + d += d_stride; + vst1q_u8(d, *q15u8); + return; } -void vpx_idct32x32_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); +void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int i, j, dest_stride8; + uint8_t *d; + int16_t a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - d += dest_stride8; - } - } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - d += dest_stride8; - } - } + dest_stride8 = dest_stride * 8; + if (a1 >= 0) { // diff_positive_32_32 + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + d += dest_stride8; + } } - return; + } else { // diff_negative_32_32 + a1 = -a1; + a1 = a1 < 0 ? 0 : a1 > 255 ? 
255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + d += dest_stride8; + } + } + } + return; } diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c index 025437eb963bc044a116e46f05be4d648a36064c..88b3d0109afdadf5f725b955ae0883cc673988f8 100644 --- a/vpx_dsp/arm/idct32x32_add_neon.c +++ b/vpx_dsp/arm/idct32x32_add_neon.c @@ -14,706 +14,672 @@ #include "vpx_dsp/txfm_common.h" #define LOAD_FROM_TRANSPOSED(prev, first, second) \ - q14s16 = vld1q_s16(trans_buf + first * 8); \ - q13s16 = vld1q_s16(trans_buf + second * 8); + q14s16 = vld1q_s16(trans_buf + first * 8); \ + q13s16 = vld1q_s16(trans_buf + second * 8); #define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ - qA = vld1q_s16(out + first * 32); \ - qB = vld1q_s16(out + second * 32); + qA = vld1q_s16(out + first * 32); \ + qB = vld1q_s16(out + second * 32); #define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ - vst1q_s16(out + first * 32, qA); \ - vst1q_s16(out + second * 32, qB); - -#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ - __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ - q6s16, q7s16, q8s16, q9s16); -static INLINE void __STORE_COMBINE_CENTER_RESULTS( - uint8_t *p1, - uint8_t *p2, - int stride, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16) { - int16x4_t d8s16, d9s16, d10s16, d11s16; - - d8s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d11s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d9s16 = vld1_s16((int16_t *)p1); - d10s16 = vld1_s16((int16_t *)p2); - - q7s16 = vrshrq_n_s16(q7s16, 6); - q8s16 = vrshrq_n_s16(q8s16, 6); - q9s16 = vrshrq_n_s16(q9s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - - q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), - vreinterpret_u8_s16(d9s16))); - q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_s16(d10s16))); - q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_s16(d11s16))); - q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), - vreinterpret_u8_s16(d8s16))); - - d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); - d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); - d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - - vst1_s16((int16_t *)p1, d9s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d10s16); - p2 += stride; - vst1_s16((int16_t *)p1, d8s16); - vst1_s16((int16_t *)p2, d11s16); - return; + vst1q_s16(out + first * 32, qA); \ + vst1q_s16(out + second * 32, qB); + +#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ + __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16); +static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2, + int stride, int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16) { + int16x4_t d8s16, d9s16, d10s16, d11s16; + + d8s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d11s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d9s16 = vld1_s16((int16_t *)p1); + d10s16 = vld1_s16((int16_t *)p2); + + q7s16 = vrshrq_n_s16(q7s16, 6); + q8s16 = vrshrq_n_s16(q8s16, 6); + q9s16 = vrshrq_n_s16(q9s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + + q7s16 = 
vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16))); + q8s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16))); + q9s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16))); + q6s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16))); + + d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); + d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); + d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + + vst1_s16((int16_t *)p1, d9s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d10s16); + p2 += stride; + vst1_s16((int16_t *)p1, d8s16); + vst1_s16((int16_t *)p2, d11s16); + return; } -#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ - __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ - q4s16, q5s16, q6s16, q7s16); -static INLINE void __STORE_COMBINE_EXTREME_RESULTS( - uint8_t *p1, - uint8_t *p2, - int stride, - int16x8_t q4s16, - int16x8_t q5s16, - int16x8_t q6s16, - int16x8_t q7s16) { - int16x4_t d4s16, d5s16, d6s16, d7s16; - - d4s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d7s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d5s16 = vld1_s16((int16_t *)p1); - d6s16 = vld1_s16((int16_t *)p2); - - q5s16 = vrshrq_n_s16(q5s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - q7s16 = vrshrq_n_s16(q7s16, 6); - q4s16 = vrshrq_n_s16(q4s16, 6); - - q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), - vreinterpret_u8_s16(d5s16))); - q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), - vreinterpret_u8_s16(d6s16))); - q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), - vreinterpret_u8_s16(d7s16))); - q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), - vreinterpret_u8_s16(d4s16))); - - d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); - d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); - - vst1_s16((int16_t *)p1, d5s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d6s16); - p2 += stride; - vst1_s16((int16_t *)p2, d7s16); - vst1_s16((int16_t *)p1, d4s16); - return; +#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \ + ; \ + __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16); +static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2, + int stride, int16x8_t q4s16, + int16x8_t q5s16, + int16x8_t q6s16, + int16x8_t q7s16) { + int16x4_t d4s16, d5s16, d6s16, d7s16; + + d4s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d7s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d5s16 = vld1_s16((int16_t *)p1); + d6s16 = vld1_s16((int16_t *)p2); + + q5s16 = vrshrq_n_s16(q5s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + q7s16 = vrshrq_n_s16(q7s16, 6); + q4s16 = vrshrq_n_s16(q4s16, 6); + + q5s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16))); + q6s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16))); + q7s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16))); + q4s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16))); + + d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); + d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + + 
vst1_s16((int16_t *)p1, d5s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d6s16); + p2 += stride; + vst1_s16((int16_t *)p2, d7s16); + vst1_s16((int16_t *)p1, d4s16); + return; } #define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ - DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static INLINE void DO_BUTTERFLY( - int16x8_t q14s16, - int16x8_t q13s16, - int16_t first_const, - int16_t second_const, - int16x8_t *qAs16, - int16x8_t *qBs16) { - int16x4_t d30s16, d31s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; - int16x4_t dCs16, dDs16, dAs16, dBs16; - - dCs16 = vget_low_s16(q14s16); - dDs16 = vget_high_s16(q14s16); - dAs16 = vget_low_s16(q13s16); - dBs16 = vget_high_s16(q13s16); - - d30s16 = vdup_n_s16(first_const); - d31s16 = vdup_n_s16(second_const); - - q8s32 = vmull_s16(dCs16, d30s16); - q10s32 = vmull_s16(dAs16, d31s16); - q9s32 = vmull_s16(dDs16, d30s16); - q11s32 = vmull_s16(dBs16, d31s16); - q12s32 = vmull_s16(dCs16, d31s16); - - q8s32 = vsubq_s32(q8s32, q10s32); - q9s32 = vsubq_s32(q9s32, q11s32); - - q10s32 = vmull_s16(dDs16, d31s16); - q11s32 = vmull_s16(dAs16, d30s16); - q15s32 = vmull_s16(dBs16, d30s16); - - q11s32 = vaddq_s32(q12s32, q11s32); - q10s32 = vaddq_s32(q10s32, q15s32); - - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), - vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), - vqrshrn_n_s32(q10s32, 14)); - return; + DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); +static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, + int16_t first_const, int16_t second_const, + int16x8_t *qAs16, int16x8_t *qBs16) { + int16x4_t d30s16, d31s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; + int16x4_t dCs16, dDs16, dAs16, dBs16; + + dCs16 = vget_low_s16(q14s16); + dDs16 = vget_high_s16(q14s16); + dAs16 = vget_low_s16(q13s16); + dBs16 = vget_high_s16(q13s16); + + d30s16 = vdup_n_s16(first_const); + d31s16 = vdup_n_s16(second_const); + + q8s32 = vmull_s16(dCs16, d30s16); + q10s32 = vmull_s16(dAs16, d31s16); + q9s32 = vmull_s16(dDs16, d30s16); + q11s32 = vmull_s16(dBs16, d31s16); + q12s32 = vmull_s16(dCs16, d31s16); + + q8s32 = vsubq_s32(q8s32, q10s32); + q9s32 = vsubq_s32(q9s32, q11s32); + + q10s32 = vmull_s16(dDs16, d31s16); + q11s32 = vmull_s16(dAs16, d30s16); + q15s32 = vmull_s16(dBs16, d30s16); + + q11s32 = vaddq_s32(q12s32, q11s32); + q10s32 = vaddq_s32(q10s32, q15s32); + + *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); + *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14)); + return; } -static INLINE void idct32_transpose_pair( - int16_t *input, - int16_t *t_buf) { - int16_t *in; - int i; - const int stride = 32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = 
vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - q12s16 = vcombine_s16(d17s16, d25s16); - q13s16 = vcombine_s16(d19s16, d27s16); - q14s16 = vcombine_s16(d21s16, d29s16); - q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16), - vreinterpretq_s32_s16(q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16), - vreinterpretq_s32_s16(q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16), - vreinterpretq_s32_s16(q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - vst1q_s16(t_buf, q0x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q0x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[1]); - t_buf += 8; - } - return; +static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { + int16_t *in; + int i; + const int stride = 32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + for (i = 0; i < 4; i++, input += 8) { + in = input; + q8s16 = vld1q_s16(in); + in += stride; + q9s16 = vld1q_s16(in); + in += stride; + q10s16 = vld1q_s16(in); + in += stride; + q11s16 = vld1q_s16(in); + in += stride; + q12s16 = vld1q_s16(in); + in += stride; + q13s16 = vld1q_s16(in); + in += stride; + q14s16 = vld1q_s16(in); + in += stride; + q15s16 = vld1q_s16(in); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + 
q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + q12s16 = vcombine_s16(d17s16, d25s16); + q13s16 = vcombine_s16(d19s16, d27s16); + q14s16 = vcombine_s16(d21s16, d29s16); + q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + vst1q_s16(t_buf, q0x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q0x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[1]); + t_buf += 8; + } + return; } -static INLINE void idct32_bands_end_1st_pass( - int16_t *out, - int16x8_t q2s16, - int16x8_t q3s16, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16, - int16x8_t q10s16, - int16x8_t q11s16, - int16x8_t q12s16, - int16x8_t q13s16, - int16x8_t q14s16, - int16x8_t q15s16) { - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); - STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); - - LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); - STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); - - LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); - STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); - - LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); - STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); - - LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); - STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); - - LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, 
q1s16); - STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); - STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); - - LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); - STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); - - LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); - STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); - return; +static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, + int16x8_t q3s16, int16x8_t q6s16, + int16x8_t q7s16, int16x8_t q8s16, + int16x8_t q9s16, int16x8_t q10s16, + int16x8_t q11s16, int16x8_t q12s16, + int16x8_t q13s16, int16x8_t q14s16, + int16x8_t q15s16) { + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); + STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); + + LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); + STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); + + LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); + STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); + + LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); + STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); + + LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); + STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); + + LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); + STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); + + LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); + STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); + + LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); + q4s16 = 
vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); + STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); + return; } static INLINE void idct32_bands_end_2nd_pass( - int16_t *out, - uint8_t *dest, - int stride, - int16x8_t q2s16, - int16x8_t q3s16, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16, - int16x8_t q10s16, - int16x8_t q11s16, - int16x8_t q12s16, - int16x8_t q13s16, - int16x8_t q14s16, - int16x8_t q15s16) { - uint8_t *r6 = dest + 31 * stride; - uint8_t *r7 = dest/* + 0 * stride*/; - uint8_t *r9 = dest + 15 * stride; - uint8_t *r10 = dest + 16 * stride; - int str2 = stride << 1; - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - - LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - return; + int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16, + int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, + int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, + int16x8_t q14s16, int16x8_t q15s16) { + uint8_t *r6 = dest + 31 * stride; + uint8_t *r7 = dest /* + 0 * stride*/; + uint8_t *r9 = dest + 15 * stride; + uint8_t *r10 = dest + 16 * 
stride; + int str2 = stride << 1; + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + + LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + return; } -void vpx_idct32x32_1024_add_neon( - int16_t *input, - uint8_t *dest, - int stride) { - int i, idct32_pass_loop; - int16_t trans_buf[32 * 8]; - int16_t pass1[32 * 32]; - int16_t pass2[32 * 32]; - int16_t *out; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - - for (idct32_pass_loop = 0, out = pass1; - idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; - i < 4; i++, - input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); - - // ----------------------------------------- - // BLOCK A: 16-19,28-31 - // ----------------------------------------- - // generate 16,17,30,31 - // part of stage 1 - LOAD_FROM_TRANSPOSED(0, 1, 31) - DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(31, 17, 15) - DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) - // part of stage 2 - q4s16 = vaddq_s16(q0s16, q1s16); - q13s16 = vsubq_s16(q0s16, 
q1s16); - q6s16 = vaddq_s16(q2s16, q3s16); - q14s16 = vsubq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) - - // generate 18,19,28,29 - // part of stage 1 - LOAD_FROM_TRANSPOSED(15, 9, 23) - DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(23, 25, 7) - DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q3s16, q2s16); - q3s16 = vaddq_s16(q3s16, q2s16); - q14s16 = vsubq_s16(q1s16, q0s16); - q2s16 = vaddq_s16(q1s16, q0s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) - // part of stage 4 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q10s16 = vaddq_s16(q7s16, q1s16); - q15s16 = vaddq_s16(q6s16, q3s16); - q13s16 = vsubq_s16(q5s16, q0s16); - q14s16 = vsubq_s16(q7s16, q1s16); - STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) - STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) - STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) - // part of stage 4 - q13s16 = vsubq_s16(q4s16, q2s16); - q14s16 = vsubq_s16(q6s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) - STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) - - // ----------------------------------------- - // BLOCK B: 20-23,24-27 - // ----------------------------------------- - // generate 20,21,26,27 - // part of stage 1 - LOAD_FROM_TRANSPOSED(7, 5, 27) - DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(27, 21, 11) - DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - - // generate 22,23,24,25 - // part of stage 1 - LOAD_FROM_TRANSPOSED(11, 13, 19) - DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(19, 29, 3) - DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) - // part of stage 2 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) - // part of stage 4 - q10s16 = vaddq_s16(q7s16, q1s16); - q11s16 = vaddq_s16(q5s16, q0s16); - q12s16 = vaddq_s16(q6s16, q2s16); - q15s16 = vaddq_s16(q4s16, q3s16); - // part of stage 6 - LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q11s16); - q9s16 = vaddq_s16(q13s16, q10s16); - q13s16 = vsubq_s16(q13s16, q10s16); - q11s16 = vsubq_s16(q14s16, q11s16); - STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) - LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) - q8s16 = vsubq_s16(q9s16, q12s16); - q10s16 = vaddq_s16(q14s16, q15s16); - q14s16 = vsubq_s16(q14s16, q15s16); - q12s16 = vaddq_s16(q9s16, q12s16); - STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) - q13s16 = q11s16; - q14s16 = q8s16; - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) - // part of stage 4 - q14s16 = vsubq_s16(q5s16, q0s16); - q13s16 = vsubq_s16(q6s16, q2s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); - q14s16 = vsubq_s16(q7s16, q1s16); - q13s16 = 
vsubq_s16(q4s16, q3s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); - // part of stage 6 - LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q1s16); - q9s16 = vaddq_s16(q13s16, q6s16); - q13s16 = vsubq_s16(q13s16, q6s16); - q1s16 = vsubq_s16(q14s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) - LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) - q14s16 = vsubq_s16(q8s16, q5s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q9s16, q0s16); - q0s16 = vsubq_s16(q9s16, q0s16); - STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) - DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, - &q1s16, &q0s16); - STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) - - // ----------------------------------------- - // BLOCK C: 8-10,11-15 - // ----------------------------------------- - // generate 8,9,14,15 - // part of stage 2 - LOAD_FROM_TRANSPOSED(3, 2, 30) - DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(30, 18, 14) - DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) - // part of stage 3 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 4 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) - - // generate 10,11,12,13 - // part of stage 2 - LOAD_FROM_TRANSPOSED(14, 10, 22) - DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(22, 26, 6) - DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) - // part of stage 3 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 4 - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) - // part of stage 5 - q8s16 = vaddq_s16(q0s16, q5s16); - q9s16 = vaddq_s16(q1s16, q7s16); - q13s16 = vsubq_s16(q1s16, q7s16); - q14s16 = vsubq_s16(q3s16, q4s16); - q10s16 = vaddq_s16(q3s16, q4s16); - q15s16 = vaddq_s16(q2s16, q6s16); - STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) - STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) - // part of stage 6 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) - q13s16 = vsubq_s16(q0s16, q5s16); - q14s16 = vsubq_s16(q2s16, q6s16); - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) - - // ----------------------------------------- - // BLOCK D: 0-3,4-7 - // ----------------------------------------- - // generate 4,5,6,7 - // part of stage 3 - LOAD_FROM_TRANSPOSED(6, 4, 28) - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(28, 20, 12) - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - // part of stage 4 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - - // generate 0,1,2,3 - // part of stage 4 - LOAD_FROM_TRANSPOSED(12, 0, 16) - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(16, 8, 24) - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) - // part of stage 5 - q4s16 = vaddq_s16(q7s16, q6s16); - q7s16 = vsubq_s16(q7s16, q6s16); - q6s16 = vsubq_s16(q5s16, q14s16); - q5s16 = vaddq_s16(q5s16, q14s16); - // part of stage 6 - 
q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q3s16); - q10s16 = vaddq_s16(q6s16, q1s16); - q11s16 = vaddq_s16(q7s16, q0s16); - q12s16 = vsubq_s16(q7s16, q0s16); - q13s16 = vsubq_s16(q6s16, q1s16); - q14s16 = vsubq_s16(q5s16, q3s16); - q15s16 = vsubq_s16(q4s16, q2s16); - // part of stage 7 - LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) - q2s16 = vaddq_s16(q8s16, q1s16); - q3s16 = vaddq_s16(q9s16, q0s16); - q4s16 = vsubq_s16(q9s16, q0s16); - q5s16 = vsubq_s16(q8s16, q1s16); - LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - - if (idct32_pass_loop == 0) { - idct32_bands_end_1st_pass(out, - q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); - } else { - idct32_bands_end_2nd_pass(out, dest, stride, - q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); - dest += 8; - } - } +void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) { + int i, idct32_pass_loop; + int16_t trans_buf[32 * 8]; + int16_t pass1[32 * 32]; + int16_t pass2[32 * 32]; + int16_t *out; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + + for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; + idct32_pass_loop++, + input = pass1, // the input of pass2 is the result of pass1 + out = pass2) { + for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + LOAD_FROM_TRANSPOSED(0, 1, 31) + DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(31, 17, 15) + DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + // part of stage 2 + q4s16 = vaddq_s16(q0s16, q1s16); + q13s16 = vsubq_s16(q0s16, q1s16); + q6s16 = vaddq_s16(q2s16, q3s16); + q14s16 = vsubq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + + // generate 18,19,28,29 + // part of stage 1 + LOAD_FROM_TRANSPOSED(15, 9, 23) + DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(23, 25, 7) + DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q3s16, q2s16); + q3s16 = vaddq_s16(q3s16, q2s16); + q14s16 = vsubq_s16(q1s16, q0s16); + q2s16 = vaddq_s16(q1s16, q0s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + // part of stage 4 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q10s16 = vaddq_s16(q7s16, q1s16); + q15s16 = vaddq_s16(q6s16, q3s16); + q13s16 = vsubq_s16(q5s16, q0s16); + q14s16 = vsubq_s16(q7s16, q1s16); + STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) + STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) + STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + // part of stage 4 + q13s16 = vsubq_s16(q4s16, q2s16); + q14s16 = vsubq_s16(q6s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) + STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of 
stage 1 + LOAD_FROM_TRANSPOSED(7, 5, 27) + DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(27, 21, 11) + DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + + // generate 22,23,24,25 + // part of stage 1 + LOAD_FROM_TRANSPOSED(11, 13, 19) + DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(19, 29, 3) + DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + // part of stage 2 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + // part of stage 4 + q10s16 = vaddq_s16(q7s16, q1s16); + q11s16 = vaddq_s16(q5s16, q0s16); + q12s16 = vaddq_s16(q6s16, q2s16); + q15s16 = vaddq_s16(q4s16, q3s16); + // part of stage 6 + LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q11s16); + q9s16 = vaddq_s16(q13s16, q10s16); + q13s16 = vsubq_s16(q13s16, q10s16); + q11s16 = vsubq_s16(q14s16, q11s16); + STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) + LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) + q8s16 = vsubq_s16(q9s16, q12s16); + q10s16 = vaddq_s16(q14s16, q15s16); + q14s16 = vsubq_s16(q14s16, q15s16); + q12s16 = vaddq_s16(q9s16, q12s16); + STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) + q13s16 = q11s16; + q14s16 = q8s16; + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + // part of stage 4 + q14s16 = vsubq_s16(q5s16, q0s16); + q13s16 = vsubq_s16(q6s16, q2s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); + q14s16 = vsubq_s16(q7s16, q1s16); + q13s16 = vsubq_s16(q4s16, q3s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + // part of stage 6 + LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q1s16); + q9s16 = vaddq_s16(q13s16, q6s16); + q13s16 = vsubq_s16(q13s16, q6s16); + q1s16 = vsubq_s16(q14s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) + LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) + q14s16 = vsubq_s16(q8s16, q5s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q9s16, q0s16); + q0s16 = vsubq_s16(q9s16, q0s16); + STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) + DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16); + STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + LOAD_FROM_TRANSPOSED(3, 2, 30) + DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(30, 18, 14) + DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + // part of stage 3 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 4 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + + // generate 10,11,12,13 + // part of stage 2 + 
LOAD_FROM_TRANSPOSED(14, 10, 22) + DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(22, 26, 6) + DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + // part of stage 3 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 4 + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + // part of stage 5 + q8s16 = vaddq_s16(q0s16, q5s16); + q9s16 = vaddq_s16(q1s16, q7s16); + q13s16 = vsubq_s16(q1s16, q7s16); + q14s16 = vsubq_s16(q3s16, q4s16); + q10s16 = vaddq_s16(q3s16, q4s16); + q15s16 = vaddq_s16(q2s16, q6s16); + STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) + STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + // part of stage 6 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) + q13s16 = vsubq_s16(q0s16, q5s16); + q14s16 = vsubq_s16(q2s16, q6s16); + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + LOAD_FROM_TRANSPOSED(6, 4, 28) + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(28, 20, 12) + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + // part of stage 4 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + + // generate 0,1,2,3 + // part of stage 4 + LOAD_FROM_TRANSPOSED(12, 0, 16) + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(16, 8, 24) + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + // part of stage 5 + q4s16 = vaddq_s16(q7s16, q6s16); + q7s16 = vsubq_s16(q7s16, q6s16); + q6s16 = vsubq_s16(q5s16, q14s16); + q5s16 = vaddq_s16(q5s16, q14s16); + // part of stage 6 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q3s16); + q10s16 = vaddq_s16(q6s16, q1s16); + q11s16 = vaddq_s16(q7s16, q0s16); + q12s16 = vsubq_s16(q7s16, q0s16); + q13s16 = vsubq_s16(q6s16, q1s16); + q14s16 = vsubq_s16(q5s16, q3s16); + q15s16 = vsubq_s16(q4s16, q2s16); + // part of stage 7 + LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) + q2s16 = vaddq_s16(q8s16, q1s16); + q3s16 = vaddq_s16(q9s16, q0s16); + q4s16 = vsubq_s16(q9s16, q0s16); + q5s16 = vsubq_s16(q8s16, q1s16); + LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, + q15s16); + } else { + idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16, + q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, + q14s16, q15s16); + dest += 8; + } } - return; + } + return; } diff --git a/vpx_dsp/arm/idct4x4_1_add_neon.c b/vpx_dsp/arm/idct4x4_1_add_neon.c index ea618700c95457f77b1a361754446ce98eb5eba4..9f999e979d7fd1721db04c2475a36902463879f5 100644 --- a/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -13,38 +13,34 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_ports/mem.h" -void vpx_idct4x4_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d6u8; - 
uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); +void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d6u8; + uint32x2_t d2u32 = vdup_n_u32(0); + uint16x8_t q8u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); - q0s16 = vdupq_n_s16(a1); + q0s16 = vdupq_n_s16(a1); - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; + // dc_only_idct_add + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); + d1 += dest_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); + d1 += dest_stride; - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), - vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); + d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } - return; + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); + d2 += dest_stride; + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); + d2 += dest_stride; + } + return; } diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index 3c975c99b771c7fbe063fecba3dde0d469e116ef..382626928bfdcd15dffad54bff915edd6b845f4f 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -10,142 +10,137 @@ #include <arm_neon.h> -void vpx_idct4x4_16_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - int16_t cospi_8_64 = 15137; - int16_t cospi_16_64 = 11585; - int16_t cospi_24_64 = 6270; - - d26u32 = d27u32 = vdup_n_u32(0); - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - d20s16 = vdup_n_s16(cospi_8_64); - d21s16 = vdup_n_s16(cospi_16_64); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - d22s16 = vdup_n_s16(cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - 
d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); - return; +void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d26u8, d27u8; + uint32x2_t d26u32, d27u32; + uint16x8_t q8u16, q9u16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; + int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; + int16x8_t q8s16, q9s16, q13s16, q14s16; + int32x4_t q1s32, q13s32, q14s32, q15s32; + int16x4x2_t d0x2s16, d1x2s16; + int32x4x2_t q0x2s32; + uint8_t *d; + int16_t cospi_8_64 = 15137; + int16_t cospi_16_64 = 11585; + int16_t cospi_24_64 = 6270; + + d26u32 = d27u32 = vdup_n_u32(0); + + q8s16 = 
vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + d20s16 = vdup_n_s16(cospi_8_64); + d21s16 = vdup_n_s16(cospi_16_64); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + d22s16 = vdup_n_s16(cospi_24_64); + + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_high_s16(q9s16); // vswp d18 d19 + d19s16 = vget_low_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + // do the transform on columns + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + q8s16 = vrshrq_n_s16(q8s16, 4); + q9s16 = vrshrq_n_s16(q9s16, 4); + + d = dest; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); + d += dest_stride; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); + + d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + + d = dest; + vst1_lane_u32((uint32_t *)d, 
vreinterpret_u32_u8(d26u8), 0); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); + return; } diff --git a/vpx_dsp/arm/idct8x8_1_add_neon.c b/vpx_dsp/arm/idct8x8_1_add_neon.c index c1b801fad54390af8544f994de6bc9a2f98cae94..e3db0b876bc2cb5cf3fe2db7ee70ee99a555e639 100644 --- a/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -13,52 +13,49 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_ports/mem.h" -void vpx_idct8x8_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - return; +void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 5); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d5u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, 
vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; + } + return; } diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index 4b2c2a6f83c0ce731605827984aa658dd662ada4..f1c271110dea9fedc801d36e2e416676c5d81c53 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -13,528 +13,496 @@ #include "./vpx_config.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE8X8( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), - vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), - vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), - vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), - vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; +static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t *q14s16, int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = 
vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; } -static INLINE void IDCT8x8_1D( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = 
vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); - return; +static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t 
*q14s16, int16x8_t *q15s16) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d26s16, d2s16); + q6s32 = vmull_s16(d27s16, d2s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); + q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q2s32 = vmull_s16(d18s16, d1s16); + q3s32 = vmull_s16(d19s16, d1s16); + q9s32 = vmull_s16(d26s16, d3s16); + q13s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlal_s16(q2s32, d30s16, d0s16); + q3s32 = vmlal_s16(q3s32, d31s16, d0s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q13s32 = vmlal_s16(q13s32, d23s16, d2s16); + + d14s16 = vqrshrn_n_s32(q2s32, 14); + d15s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q13s32, 14); + q6s16 = vcombine_s16(d12s16, d13s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d0s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d0s16); + q3s32 = vmull_s16(d17s16, d0s16); + q13s32 = vmull_s16(d16s16, d0s16); + q15s32 = vmull_s16(d17s16, d0s16); + + q2s32 = vmlal_s16(q2s32, d24s16, d0s16); + q3s32 = vmlal_s16(q3s32, d25s16, d0s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); + q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); + + d0s16 = vdup_n_s16(cospi_24_64); + d1s16 = vdup_n_s16(cospi_8_64); + + d18s16 = vqrshrn_n_s32(q2s32, 14); + d19s16 = vqrshrn_n_s32(q3s32, 14); + d22s16 = vqrshrn_n_s32(q13s32, 14); + d23s16 = vqrshrn_n_s32(q15s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q2s32 = vmull_s16(d20s16, d0s16); + q3s32 = vmull_s16(d21s16, d0s16); + q8s32 = vmull_s16(d20s16, d1s16); + q12s32 = vmull_s16(d21s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); + q8s32 = vmlal_s16(q8s32, d28s16, d0s16); + q12s32 = vmlal_s16(q12s32, d29s16, d0s16); + + d26s16 = vqrshrn_n_s32(q2s32, 14); + d27s16 = vqrshrn_n_s32(q3s32, 14); + d30s16 = vqrshrn_n_s32(q8s32, 14); + d31s16 = vqrshrn_n_s32(q12s32, 14); + *q13s16 = vcombine_s16(d26s16, d27s16); + *q15s16 = vcombine_s16(d30s16, d31s16); + + q0s16 = vaddq_s16(*q9s16, *q15s16); + q1s16 = 
vaddq_s16(*q11s16, *q13s16); + q2s16 = vsubq_s16(*q11s16, *q13s16); + q3s16 = vsubq_s16(*q9s16, *q15s16); + + *q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + *q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + *q8s16 = vaddq_s16(q0s16, q7s16); + *q9s16 = vaddq_s16(q1s16, q6s16); + *q10s16 = vaddq_s16(q2s16, q5s16); + *q11s16 = vaddq_s16(q3s16, q4s16); + *q12s16 = vsubq_s16(q3s16, q4s16); + *q13s16 = vsubq_s16(q2s16, q5s16); + *q14s16 = vsubq_s16(q1s16, q6s16); + *q15s16 = vsubq_s16(q0s16, q7s16); + return; } -void vpx_idct8x8_64_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t 
*)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; +void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = 
q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; } -void vpx_idct8x8_12_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16(cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16(cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_8_64 * 2); - - q13s16 = vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - 
d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; +void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, 
q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // First transform rows + // stage 1 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + + q0s16 = vdupq_n_s16(-cospi_20_64 * 2); + + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + q1s16 = vdupq_n_s16(cospi_12_64 * 2); + + q5s16 = vqrdmulhq_s16(q11s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_16_64 * 2); + + q6s16 = vqrdmulhq_s16(q11s16, q1s16); + + // stage 2 & stage 3 - even half + q1s16 = vdupq_n_s16(cospi_24_64 * 2); + + q9s16 = vqrdmulhq_s16(q8s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_8_64 * 2); + + q13s16 = vqrdmulhq_s16(q10s16, q1s16); + + q15s16 = vqrdmulhq_s16(q10s16, q0s16); + + // stage 3 -odd half + q0s16 = vaddq_s16(q9s16, q15s16); + q1s16 = vaddq_s16(q9s16, q13s16); + q2s16 = vsubq_s16(q9s16, q13s16); + q3s16 = vsubq_s16(q9s16, q15s16); + + // stage 2 - odd half + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + q8s16 = vaddq_s16(q0s16, q7s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q7s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + 
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; } diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 0a376104d2bbc1f0d57c3e6062f95f4f91b53361..32dd1ba14606f25efabf12195ff882f147a31556 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -18,9 +18,8 @@ // DC 4x4 // 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { +static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; @@ -33,7 +32,7 @@ static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, } if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border + const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left const uint16x4_t p1 = vpadd_u16(p0, p0); sum_left = vcombine_u16(p1, p1); @@ -54,7 +53,7 @@ static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0); + vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); } } } @@ -87,9 +86,8 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, // DC 8x8 // 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { +static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; @@ -103,7 +101,7 @@ static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, } if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border + const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); @@ -125,7 +123,7 @@ static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); + vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); } } } @@ -167,7 +165,7 @@ static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, if (do_above) { const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); @@ -425,8 +423,7 @@ void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, (void)left; d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) - vst1_u8(dst, d0u8); + for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); } void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, @@ -436,8 +433,7 @@ void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, (void)left; q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) - vst1q_u8(dst, q0u8); + for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); } void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, @@ -608,8 +604,8 @@ void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); for (i = 0; i < 4; i++, dst += stride) { q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), - vreinterpretq_s16_u16(q3u16)); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); d0u8 = vqmovun_s16(q1s16); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); } @@ -631,26 +627,26 @@ void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, d20u16 = vget_low_u16(q10u16); for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, 
vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; @@ -677,14 +673,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { q0u16 = vdupq_lane_u16(d20u16, 0); q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); + q11s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); + q8s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); d2u8 = vqmovun_s16(q1s16); d3u8 = vqmovun_s16(q0s16); d22u8 = vqmovun_s16(q11s16); @@ -698,14 +694,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, q0u16 = vdupq_lane_u16(d20u16, 2); q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); + q11s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); + q8s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); d2u8 = vqmovun_s16(q1s16); d3u8 = vqmovun_s16(q0s16); d22u8 = vqmovun_s16(q11s16); @@ -742,10 +738,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d6u16 = vget_low_u16(q3u16); for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), @@ -761,10 +757,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, dst += stride; q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), @@ -780,10 +776,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, 
ptrdiff_t stride, dst += stride; q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), @@ -799,10 +795,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, dst += stride; q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), diff --git a/vpx_dsp/arm/loopfilter_16_neon.c b/vpx_dsp/arm/loopfilter_16_neon.c index d24e6adc8a64a7bc72a91b5eecc19730c9288758..9607bb24056b4cc39212a5781ba812da10fbec3b 100644 --- a/vpx_dsp/arm/loopfilter_16_neon.c +++ b/vpx_dsp/arm/loopfilter_16_neon.c @@ -14,166 +14,160 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -static INLINE void loop_filter_neon_16( - uint8x16_t qblimit, // blimit - uint8x16_t qlimit, // limit - uint8x16_t qthresh, // thresh - uint8x16_t q3, // p3 - uint8x16_t q4, // p2 - uint8x16_t q5, // p1 - uint8x16_t q6, // p0 - uint8x16_t q7, // q0 - uint8x16_t q8, // q1 - uint8x16_t q9, // q2 - uint8x16_t q10, // q3 - uint8x16_t *q5r, // p1 - uint8x16_t *q6r, // p0 - uint8x16_t *q7r, // q0 - uint8x16_t *q8r) { // q1 - uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int16x8_t q2s16, q11s16; - uint16x8_t q4u16; - int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; - int8x8_t d2s8, d3s8; - - q11u8 = vabdq_u8(q3, q4); - q12u8 = vabdq_u8(q4, q5); - q13u8 = vabdq_u8(q5, q6); - q14u8 = vabdq_u8(q8, q7); - q3 = vabdq_u8(q9, q8); - q4 = vabdq_u8(q10, q9); - - q11u8 = vmaxq_u8(q11u8, q12u8); - q12u8 = vmaxq_u8(q13u8, q14u8); - q3 = vmaxq_u8(q3, q4); - q15u8 = vmaxq_u8(q11u8, q12u8); - - q9 = vabdq_u8(q6, q7); - - // vp8_hevmask - q13u8 = vcgtq_u8(q13u8, qthresh); - q14u8 = vcgtq_u8(q14u8, qthresh); - q15u8 = vmaxq_u8(q15u8, q3); - - q2u8 = vabdq_u8(q5, q8); - q9 = vqaddq_u8(q9, q9); - - q15u8 = vcgeq_u8(qlimit, q15u8); - - // vp8_filter() function - // convert to signed - q10 = vdupq_n_u8(0x80); - q8 = veorq_u8(q8, q10); - q7 = veorq_u8(q7, q10); - q6 = veorq_u8(q6, q10); - q5 = veorq_u8(q5, q10); - - q2u8 = vshrq_n_u8(q2u8, 1); - q9 = vqaddq_u8(q9, q2u8); - - q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), - vget_low_s8(vreinterpretq_s8_u8(q6))); - q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), - vget_high_s8(vreinterpretq_s8_u8(q6))); - - q9 = vcgeq_u8(qblimit, q9); - - q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), - vreinterpretq_s8_u8(q8)); - - q14u8 = vorrq_u8(q13u8, q14u8); - - q4u16 = vdupq_n_u16(3); - q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); - q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); - - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); - q15u8 = vandq_u8(q15u8, q9); - - q1s8 = vreinterpretq_s8_u8(q1u8); - q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); - q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); 
- - q4 = vdupq_n_u8(3); - q9 = vdupq_n_u8(4); - // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) - d2s8 = vqmovn_s16(q2s16); - d3s8 = vqmovn_s16(q11s16); - q1s8 = vcombine_s8(d2s8, d3s8); - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); - q1s8 = vreinterpretq_s8_u8(q1u8); - - q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); - q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); - q2s8 = vshrq_n_s8(q2s8, 3); - q1s8 = vshrq_n_s8(q1s8, 3); - - q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); - q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); - - q1s8 = vrshrq_n_s8(q1s8, 1); - q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); - - q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); - q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); - - *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); - *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); - *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); - *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); - return; +static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vdupq_n_u16(3); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q4 = vdupq_n_u8(3); + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q0s8 = 
vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); + *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); + return; } -void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; - uint8x16_t qblimit, qlimit, qthresh; - uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; - - dblimit0 = vld1_u8(blimit0); - dlimit0 = vld1_u8(limit0); - dthresh0 = vld1_u8(thresh0); - dblimit1 = vld1_u8(blimit1); - dlimit1 = vld1_u8(limit1); - dthresh1 = vld1_u8(thresh1); - qblimit = vcombine_u8(dblimit0, dblimit1); - qlimit = vcombine_u8(dlimit0, dlimit1); - qthresh = vcombine_u8(dthresh0, dthresh1); - - s -= (p << 2); - - q3u8 = vld1q_u8(s); - s += p; - q4u8 = vld1q_u8(s); - s += p; - q5u8 = vld1q_u8(s); - s += p; - q6u8 = vld1q_u8(s); - s += p; - q7u8 = vld1q_u8(s); - s += p; - q8u8 = vld1q_u8(s); - s += p; - q9u8 = vld1q_u8(s); - s += p; - q10u8 = vld1q_u8(s); - - loop_filter_neon_16(qblimit, qlimit, qthresh, - q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, - &q5u8, &q6u8, &q7u8, &q8u8); - - s -= (p * 5); - vst1q_u8(s, q5u8); - s += p; - vst1q_u8(s, q6u8); - s += p; - vst1q_u8(s, q7u8); - s += p; - vst1q_u8(s, q8u8); - return; +void vpx_lpf_horizontal_4_dual_neon( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; + uint8x16_t qblimit, qlimit, qthresh; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + + dblimit0 = vld1_u8(blimit0); + dlimit0 = vld1_u8(limit0); + dthresh0 = vld1_u8(thresh0); + dblimit1 = vld1_u8(blimit1); + dlimit1 = vld1_u8(limit1); + dthresh1 = vld1_u8(thresh1); + qblimit = vcombine_u8(dblimit0, dblimit1); + qlimit = vcombine_u8(dlimit0, dlimit1); + qthresh = vcombine_u8(dthresh0, dthresh1); + + s -= (p << 2); + + q3u8 = vld1q_u8(s); + s += p; + q4u8 = vld1q_u8(s); + s += p; + q5u8 = vld1q_u8(s); + s += p; + q6u8 = vld1q_u8(s); + s += p; + q7u8 = vld1q_u8(s); + s += p; + q8u8 = vld1q_u8(s); + s += p; + q9u8 = vld1q_u8(s); + s += p; + q10u8 = vld1q_u8(s); + + loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8, + q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8); + + s -= (p * 5); + vst1q_u8(s, q5u8); + s += p; + vst1q_u8(s, q6u8); + s += p; + vst1q_u8(s, q7u8); + s += p; + vst1q_u8(s, q8u8); + return; } diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c index 7f3ee70b94873aef38a4d37efe4909e732192f49..1c1e80e00088ee801ab11665ac758438a3367925 100644 --- a/vpx_dsp/arm/loopfilter_4_neon.c +++ b/vpx_dsp/arm/loopfilter_4_neon.c @@ -12,255 +12,238 @@ #include "./vpx_dsp_rtcd.h" -static INLINE void loop_filter_neon( - uint8x8_t dblimit, // flimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p3 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // 
q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d4ru8, // p1 - uint8x8_t *d5ru8, // p0 - uint8x8_t *d6ru8, // q0 - uint8x8_t *d7ru8) { // q1 - uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; - int16x8_t q12s16; - int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d3u8 = vabd_u8(d17u8, d16u8); - d4u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - d3u8 = vmax_u8(d3u8, d4u8); - d23u8 = vmax_u8(d19u8, d20u8); - - d17u8 = vabd_u8(d6u8, d7u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - d22u8 = vcgt_u8(d22u8, dthresh); - d23u8 = vmax_u8(d23u8, d3u8); - - d28u8 = vabd_u8(d5u8, d16u8); - d17u8 = vqadd_u8(d17u8, d17u8); - - d23u8 = vcge_u8(dlimit, d23u8); - - d18u8 = vdup_n_u8(0x80); - d5u8 = veor_u8(d5u8, d18u8); - d6u8 = veor_u8(d6u8, d18u8); - d7u8 = veor_u8(d7u8, d18u8); - d16u8 = veor_u8(d16u8, d18u8); - - d28u8 = vshr_n_u8(d28u8, 1); - d17u8 = vqadd_u8(d17u8, d28u8); - - d19u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), - vreinterpret_s8_u8(d6u8)); - - d17u8 = vcge_u8(dblimit, d17u8); - - d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), - vreinterpret_s8_u8(d16u8)); - - d22u8 = vorr_u8(d21u8, d22u8); - - q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); - d23u8 = vand_u8(d23u8, d17u8); - - q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); - - d17u8 = vdup_n_u8(4); - - d27s8 = vqmovn_s16(q12s16); - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); - d27s8 = vreinterpret_s8_u8(d27u8); - - d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); - d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); - d28s8 = vshr_n_s8(d28s8, 3); - d27s8 = vshr_n_s8(d27s8, 3); - - d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); - - d27s8 = vrshr_n_s8(d27s8, 1); - d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); - - d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); - d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); - - *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); - *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); - *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); - *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); - return; +static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p3 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d4ru8, // p1 + uint8x8_t *d5ru8, // p0 + uint8x8_t *d6ru8, // q0 + uint8x8_t *d7ru8) { // q1 + uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; + int16x8_t q12s16; + int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d3u8 = vabd_u8(d17u8, d16u8); + d4u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + d3u8 = vmax_u8(d3u8, d4u8); + d23u8 = vmax_u8(d19u8, d20u8); + + d17u8 = vabd_u8(d6u8, d7u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + d22u8 = vcgt_u8(d22u8, dthresh); + d23u8 = vmax_u8(d23u8, d3u8); + + d28u8 = vabd_u8(d5u8, d16u8); + d17u8 = vqadd_u8(d17u8, d17u8); + + d23u8 = vcge_u8(dlimit, d23u8); + + d18u8 = vdup_n_u8(0x80); + d5u8 = veor_u8(d5u8, d18u8); + d6u8 = 
veor_u8(d6u8, d18u8); + d7u8 = veor_u8(d7u8, d18u8); + d16u8 = veor_u8(d16u8, d18u8); + + d28u8 = vshr_n_u8(d28u8, 1); + d17u8 = vqadd_u8(d17u8, d28u8); + + d19u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8)); + + d17u8 = vcge_u8(dblimit, d17u8); + + d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8)); + + d22u8 = vorr_u8(d21u8, d22u8); + + q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); + d23u8 = vand_u8(d23u8, d17u8); + + q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); + + d17u8 = vdup_n_u8(4); + + d27s8 = vqmovn_s16(q12s16); + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); + d27s8 = vreinterpret_s8_u8(d27u8); + + d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); + d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); + d28s8 = vshr_n_s8(d28s8, 3); + d27s8 = vshr_n_s8(d27s8, 3); + + d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); + + d27s8 = vrshr_n_s8(d27s8, 1); + d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); + + d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); + d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); + + *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); + *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); + *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); + *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); + return; } -void vpx_lpf_horizontal_4_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - s -= (pitch * 5); - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - s += pitch; - vst1_u8(s, d6u8); - s += pitch; - vst1_u8(s, d7u8); - } - return; +void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < 1; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); + + s -= (pitch * 5); + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + s += pitch; + vst1_u8(s, d6u8); + s += pitch; + vst1_u8(s, d7u8); + } + return; } -void vpx_lpf_vertical_4_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const 
uint8_t *thresh) { - int i, pitch8; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - pitch8 = pitch * 8; - for (i = 0; i < 1; i++, src += pitch8) { - s = src - (i + 1) * 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - d4Result.val[0] = d4u8; - d4Result.val[1] = d5u8; - d4Result.val[2] = d6u8; - d4Result.val[3] = d7u8; - - src -= 2; - vst4_lane_u8(src, d4Result, 0); - src += pitch; - vst4_lane_u8(src, d4Result, 1); - src += pitch; - vst4_lane_u8(src, d4Result, 2); - src += pitch; - vst4_lane_u8(src, d4Result, 3); - src += pitch; - vst4_lane_u8(src, d4Result, 4); - src += pitch; - vst4_lane_u8(src, d4Result, 5); - src += pitch; - vst4_lane_u8(src, d4Result, 6); - src += pitch; - vst4_lane_u8(src, d4Result, 7); - } - return; +void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i, pitch8; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + pitch8 = pitch * 8; + for (i = 0; i < 1; i++, src += pitch8) { + s = src - (i + 1) * 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + 
d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); + + d4Result.val[0] = d4u8; + d4Result.val[1] = d5u8; + d4Result.val[2] = d6u8; + d4Result.val[3] = d7u8; + + src -= 2; + vst4_lane_u8(src, d4Result, 0); + src += pitch; + vst4_lane_u8(src, d4Result, 1); + src += pitch; + vst4_lane_u8(src, d4Result, 2); + src += pitch; + vst4_lane_u8(src, d4Result, 3); + src += pitch; + vst4_lane_u8(src, d4Result, 4); + src += pitch; + vst4_lane_u8(src, d4Result, 5); + src += pitch; + vst4_lane_u8(src, d4Result, 6); + src += pitch; + vst4_lane_u8(src, d4Result, 7); + } + return; } diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c index ec3757380d572f047145d751575ed80c336f4f44..854196f4272e692bc6f85333095c5139f7950032 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.c +++ b/vpx_dsp/arm/loopfilter_8_neon.c @@ -12,434 +12,418 @@ #include "./vpx_dsp_rtcd.h" -static INLINE void mbloop_filter_neon( - uint8x8_t dblimit, // mblimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p2 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d0ru8, // p1 - uint8x8_t *d1ru8, // p1 - uint8x8_t *d2ru8, // p0 - uint8x8_t *d3ru8, // q0 - uint8x8_t *d4ru8, // q1 - uint8x8_t *d5ru8) { // q1 - uint32_t flat; - uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; - uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; - int16x8_t q15s16; - uint16x8_t q10u16, q14u16; - int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; +static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p2 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d0ru8, // p1 + uint8x8_t *d1ru8, // p1 + uint8x8_t *d2ru8, // p0 + uint8x8_t *d3ru8, // q0 + uint8x8_t *d4ru8, // q1 + uint8x8_t *d5ru8) { // q1 + uint32_t flat; + uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, 
d24u8; + uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int16x8_t q15s16; + uint16x8_t q10u16, q14u16; + int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d23u8 = vabd_u8(d17u8, d16u8); - d24u8 = vabd_u8(d18u8, d17u8); + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d23u8 = vabd_u8(d17u8, d16u8); + d24u8 = vabd_u8(d18u8, d17u8); - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); - d25u8 = vabd_u8(d6u8, d4u8); + d25u8 = vabd_u8(d6u8, d4u8); - d23u8 = vmax_u8(d23u8, d24u8); + d23u8 = vmax_u8(d23u8, d24u8); - d26u8 = vabd_u8(d7u8, d17u8); + d26u8 = vabd_u8(d7u8, d17u8); - d19u8 = vmax_u8(d19u8, d20u8); + d19u8 = vmax_u8(d19u8, d20u8); - d24u8 = vabd_u8(d6u8, d7u8); - d27u8 = vabd_u8(d3u8, d6u8); - d28u8 = vabd_u8(d18u8, d7u8); + d24u8 = vabd_u8(d6u8, d7u8); + d27u8 = vabd_u8(d3u8, d6u8); + d28u8 = vabd_u8(d18u8, d7u8); - d19u8 = vmax_u8(d19u8, d23u8); + d19u8 = vmax_u8(d19u8, d23u8); - d23u8 = vabd_u8(d5u8, d16u8); - d24u8 = vqadd_u8(d24u8, d24u8); + d23u8 = vabd_u8(d5u8, d16u8); + d24u8 = vqadd_u8(d24u8, d24u8); + d19u8 = vcge_u8(dlimit, d19u8); - d19u8 = vcge_u8(dlimit, d19u8); + d25u8 = vmax_u8(d25u8, d26u8); + d26u8 = vmax_u8(d27u8, d28u8); + d23u8 = vshr_n_u8(d23u8, 1); - d25u8 = vmax_u8(d25u8, d26u8); - d26u8 = vmax_u8(d27u8, d28u8); + d25u8 = vmax_u8(d25u8, d26u8); - d23u8 = vshr_n_u8(d23u8, 1); + d24u8 = vqadd_u8(d24u8, d23u8); - d25u8 = vmax_u8(d25u8, d26u8); + d20u8 = vmax_u8(d20u8, d25u8); - d24u8 = vqadd_u8(d24u8, d23u8); + d23u8 = vdup_n_u8(1); + d24u8 = vcge_u8(dblimit, d24u8); - d20u8 = vmax_u8(d20u8, d25u8); + d21u8 = vcgt_u8(d21u8, dthresh); - d23u8 = vdup_n_u8(1); - d24u8 = vcge_u8(dblimit, d24u8); + d20u8 = vcge_u8(d23u8, d20u8); - d21u8 = vcgt_u8(d21u8, dthresh); + d19u8 = vand_u8(d19u8, d24u8); - d20u8 = vcge_u8(d23u8, d20u8); + d23u8 = vcgt_u8(d22u8, dthresh); - d19u8 = vand_u8(d19u8, d24u8); + d20u8 = vand_u8(d20u8, d19u8); - d23u8 = vcgt_u8(d22u8, dthresh); + d22u8 = vdup_n_u8(0x80); - d20u8 = vand_u8(d20u8, d19u8); + d23u8 = vorr_u8(d21u8, d23u8); - d22u8 = vdup_n_u8(0x80); + q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8)); - d23u8 = vorr_u8(d21u8, d23u8); + d30u8 = vshrn_n_u16(q10u16, 4); + flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); - q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), - vreinterpret_u16_u8(d21u8)); + if (flat == 0xffffffff) { // Check for all 1's, power_branch_only + d27u8 = vdup_n_u8(3); + d21u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d21u8); + q14u16 = vaddw_u8(q14u16, d5u8); + *d0ru8 = vqrshrn_n_u16(q14u16, 3); - d30u8 = vshrn_n_u16(q10u16, 4); - flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + *d1ru8 = vqrshrn_n_u16(q14u16, 3); - if (flat == 0xffffffff) { // Check for all 1's, power_branch_only - d27u8 = vdup_n_u8(3); - d21u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d21u8); - q14u16 = vaddw_u8(q14u16, d5u8); - *d0ru8 = vqrshrn_n_u16(q14u16, 3); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = 
vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + *d2ru8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - *d1ru8 = vqrshrn_n_u16(q14u16, 3); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d3ru8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - *d2ru8 = vqrshrn_n_u16(q14u16, 3); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d4ru8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d3ru8 = vqrshrn_n_u16(q14u16, 3); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d5ru8 = vqrshrn_n_u16(q14u16, 3); + } else { + d21u8 = veor_u8(d7u8, d22u8); + d24u8 = veor_u8(d6u8, d22u8); + d25u8 = veor_u8(d5u8, d22u8); + d26u8 = veor_u8(d16u8, d22u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d4ru8 = vqrshrn_n_u16(q14u16, 3); + d27u8 = vdup_n_u8(3); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d5ru8 = vqrshrn_n_u16(q14u16, 3); - } else { - d21u8 = veor_u8(d7u8, d22u8); - d24u8 = veor_u8(d6u8, d22u8); - d25u8 = veor_u8(d5u8, d22u8); - d26u8 = veor_u8(d16u8, d22u8); + d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); + d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); - d27u8 = vdup_n_u8(3); + q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); - d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); + d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); - q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); + q15s16 = vaddw_s8(q15s16, d29s8); - d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); + d29u8 = vdup_n_u8(4); - q15s16 = vaddw_s8(q15s16, d29s8); + d28s8 = vqmovn_s16(q15s16); - d29u8 = vdup_n_u8(4); + d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); - d28s8 = vqmovn_s16(q15s16); + d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); + d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); + d30s8 = vshr_n_s8(d30s8, 3); + d29s8 = vshr_n_s8(d29s8, 3); - d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); + d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); + d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); - d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); - d30s8 = vshr_n_s8(d30s8, 3); - d29s8 = vshr_n_s8(d29s8, 3); + d29s8 = vrshr_n_s8(d29s8, 1); + d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); - d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); + d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - d29s8 = vrshr_n_s8(d29s8, 1); - d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - d25s8 = 
vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - - if (flat == 0) { // filter_branch_only - *d0ru8 = d4u8; - *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - *d5ru8 = d17u8; - return; - } + if (flat == 0) { // filter_branch_only + *d0ru8 = d4u8; + *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + *d5ru8 = d17u8; + return; + } - d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - d23u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d23u8); + d23u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d23u8); - d0u8 = vbsl_u8(d20u8, dblimit, d4u8); + d0u8 = vbsl_u8(d20u8, dblimit, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d5u8); - d1u8 = vbsl_u8(d20u8, dlimit, d25u8); + d1u8 = vbsl_u8(d20u8, dlimit, d25u8); - d30u8 = vqrshrn_n_u16(q14u16, 3); + d30u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); - d2u8 = vbsl_u8(d20u8, dthresh, d24u8); + d2u8 = vbsl_u8(d20u8, dthresh, d24u8); - d31u8 = vqrshrn_n_u16(q14u16, 3); + d31u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); - *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); + *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); - d23u8 = vqrshrn_n_u16(q14u16, 3); + d23u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); - *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); + *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); - q14u16 = vaddw_u8(q14u16, d18u8); + q14u16 = vaddw_u8(q14u16, d18u8); - *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); + *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); - d22u8 = vqrshrn_n_u16(q14u16, 3); + d22u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); - d3u8 = vbsl_u8(d20u8, d3u8, d21u8); + d3u8 = vbsl_u8(d20u8, d3u8, d21u8); - q14u16 = vaddw_u8(q14u16, d18u8); + q14u16 = vaddw_u8(q14u16, d18u8); - d4u8 = vbsl_u8(d20u8, d4u8, d26u8); + d4u8 = vbsl_u8(d20u8, d4u8, d26u8); - d6u8 = 
vqrshrn_n_u16(q14u16, 3); + d6u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); - d5u8 = vbsl_u8(d20u8, d5u8, d17u8); + d5u8 = vbsl_u8(d20u8, d5u8, d17u8); - d7u8 = vqrshrn_n_u16(q14u16, 3); + d7u8 = vqrshrn_n_u16(q14u16, 3); - *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); - *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); - *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); - } - return; + *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); + *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); + *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); + } + return; } -void vpx_lpf_horizontal_8_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - s -= (pitch * 6); - vst1_u8(s, d0u8); - s += pitch; - vst1_u8(s, d1u8); - s += pitch; - vst1_u8(s, d2u8); - s += pitch; - vst1_u8(s, d3u8); - s += pitch; - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - } - return; +void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < 1; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, + &d5u8); + + s -= (pitch * 6); + vst1_u8(s, d0u8); + s += pitch; + vst1_u8(s, d1u8); + s += pitch; + vst1_u8(s, d2u8); + s += pitch; + vst1_u8(s, d3u8); + s += pitch; + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + } + return; } -void vpx_lpf_vertical_8_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - uint8x8x2_t d2Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - for (i = 0; i 
< 1; i++) { - s = src + (i * (pitch << 3)) - 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - d4Result.val[0] = d0u8; - d4Result.val[1] = d1u8; - d4Result.val[2] = d2u8; - d4Result.val[3] = d3u8; - - d2Result.val[0] = d4u8; - d2Result.val[1] = d5u8; - - s = src - 3; - vst4_lane_u8(s, d4Result, 0); - s += pitch; - vst4_lane_u8(s, d4Result, 1); - s += pitch; - vst4_lane_u8(s, d4Result, 2); - s += pitch; - vst4_lane_u8(s, d4Result, 3); - s += pitch; - vst4_lane_u8(s, d4Result, 4); - s += pitch; - vst4_lane_u8(s, d4Result, 5); - s += pitch; - vst4_lane_u8(s, d4Result, 6); - s += pitch; - vst4_lane_u8(s, d4Result, 7); - - s = src + 1; - vst2_lane_u8(s, d2Result, 0); - s += pitch; - vst2_lane_u8(s, d2Result, 1); - s += pitch; - vst2_lane_u8(s, d2Result, 2); - s += pitch; - vst2_lane_u8(s, d2Result, 3); - s += pitch; - vst2_lane_u8(s, d2Result, 4); - s += pitch; - vst2_lane_u8(s, d2Result, 5); - s += pitch; - vst2_lane_u8(s, d2Result, 6); - s += pitch; - vst2_lane_u8(s, d2Result, 7); - } - return; +void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + uint8x8x2_t d2Result; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + for (i = 0; i < 1; i++) { + s = src + (i * (pitch << 3)) - 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = 
vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, + &d5u8); + + d4Result.val[0] = d0u8; + d4Result.val[1] = d1u8; + d4Result.val[2] = d2u8; + d4Result.val[3] = d3u8; + + d2Result.val[0] = d4u8; + d2Result.val[1] = d5u8; + + s = src - 3; + vst4_lane_u8(s, d4Result, 0); + s += pitch; + vst4_lane_u8(s, d4Result, 1); + s += pitch; + vst4_lane_u8(s, d4Result, 2); + s += pitch; + vst4_lane_u8(s, d4Result, 3); + s += pitch; + vst4_lane_u8(s, d4Result, 4); + s += pitch; + vst4_lane_u8(s, d4Result, 5); + s += pitch; + vst4_lane_u8(s, d4Result, 6); + s += pitch; + vst4_lane_u8(s, d4Result, 7); + + s = src + 1; + vst2_lane_u8(s, d2Result, 0); + s += pitch; + vst2_lane_u8(s, d2Result, 1); + s += pitch; + vst2_lane_u8(s, d2Result, 2); + s += pitch; + vst2_lane_u8(s, d2Result, 3); + s += pitch; + vst2_lane_u8(s, d2Result, 4); + s += pitch; + vst2_lane_u8(s, d2Result, 5); + s += pitch; + vst2_lane_u8(s, d2Result, 6); + s += pitch; + vst2_lane_u8(s, d2Result, 7); + } + return; } diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index aa31f293588cab5e67e8a131f73091ec776352a1..9129b5d2d5596d8595315cc016aa08d82ba6761d 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -14,42 +14,32 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); } #if HAVE_NEON_ASM -void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { +void vpx_lpf_horizontal_8_dual_neon( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t 
*thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, - const uint8_t *blimit, +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh); diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index c7704dc1be67266995b6e117adba5248f53307b3..dc20398000aaa4daffb16ef65a9c0c7863dd7c78 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -16,10 +16,10 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), - vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), - vget_high_u16(vec_hi)); + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); const uint64x2_t b = vpaddlq_u32(a); const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), @@ -33,8 +33,7 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, static void sad_neon_64(const uint8x16_t vec_src_00, const uint8x16_t vec_src_16, const uint8x16_t vec_src_32, - const uint8x16_t vec_src_48, - const uint8_t *ref, + const uint8x16_t vec_src_48, const uint8_t *ref, uint16x8_t *vec_sum_ref_lo, uint16x8_t *vec_sum_ref_hi) { const uint8x16_t vec_ref_00 = vld1q_u8(ref); @@ -63,8 +62,7 @@ static void sad_neon_64(const uint8x16_t vec_src_00, // Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, // and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. 
static void sad_neon_32(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, - const uint8_t *ref, + const uint8x16_t vec_src_16, const uint8_t *ref, uint16x8_t *vec_sum_ref_lo, uint16x8_t *vec_sum_ref_hi) { const uint8x16_t vec_ref_00 = vld1q_u8(ref); @@ -81,7 +79,7 @@ static void sad_neon_32(const uint8x16_t vec_src_00, } void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t* const ref[4], int ref_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t *res) { int i; uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); @@ -127,7 +125,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, } void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t* const ref[4], int ref_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t *res) { int i; uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); @@ -148,14 +146,14 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, const uint8x16_t vec_src_00 = vld1q_u8(src); const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - sad_neon_32(vec_src_00, vec_src_16, ref0, - &vec_sum_ref0_lo, &vec_sum_ref0_hi); - sad_neon_32(vec_src_00, vec_src_16, ref1, - &vec_sum_ref1_lo, &vec_sum_ref1_hi); - sad_neon_32(vec_src_00, vec_src_16, ref2, - &vec_sum_ref2_lo, &vec_sum_ref2_hi); - sad_neon_32(vec_src_00, vec_src_16, ref3, - &vec_sum_ref3_lo, &vec_sum_ref3_hi); + sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, + &vec_sum_ref0_hi); + sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, + &vec_sum_ref1_hi); + sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, + &vec_sum_ref2_hi); + sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, + &vec_sum_ref3_hi); src += src_stride; ref0 += ref_stride; @@ -171,7 +169,7 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, } void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t* const ref[4], int ref_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t *res) { int i; uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); @@ -195,20 +193,20 @@ void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, const uint8x16_t vec_ref2 = vld1q_u8(ref2); const uint8x16_t vec_ref3 = vld1q_u8(ref3); - vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref0)); + vec_sum_ref0_lo = + vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref0)); - vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref1)); + vec_sum_ref1_lo = + vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref1)); - vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref2)); + vec_sum_ref2_lo = + vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref2)); - vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref3)); + vec_sum_ref3_lo = + vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref3)); diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 173f08ac3c3e202764a8dd01a43a9b8877d08289..ff3228768cefc8455f65267982b881c9c4ebf7d2 100644 --- a/vpx_dsp/arm/sad_neon.c +++ 
b/vpx_dsp/arm/sad_neon.c @@ -14,114 +14,105 @@ #include "vpx/vpx_integer.h" -unsigned int vpx_sad8x16_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; +unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + for (i = 0; i < 15; i++) { d0 = vld1_u8(src_ptr); src_ptr += src_stride; d8 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } - - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); } -unsigned int vpx_sad4x4_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x2_t d1; - uint64x1_t d3; - int i; +unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + for (i = 0; i < 3; i++) { d0 = vld1_u8(src_ptr); src_ptr += src_stride; d8 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } + q12 = vabal_u8(q12, d0, d8); + } - d1 = vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); - return vget_lane_u32(vreinterpret_u32_u64(d3), 0); + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); } -unsigned int vpx_sad16x8_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; +unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + for (i = 0; i < 7; i++) { q0 = vld1q_u8(src_ptr); src_ptr += src_stride; q4 = vld1q_u8(ref_ptr); ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); - - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, 
vget_high_u8(q0), vget_high_u8(q4)); - } - - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); } static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), - vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), - vget_high_u16(vec_hi)); + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); const uint64x2_t b = vpaddlq_u32(a); const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), @@ -208,10 +199,10 @@ unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, const uint8x16_t vec_ref = vld1q_u8(ref); src += src_stride; ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref)); + vec_accum_lo = + vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); + vec_accum_hi = + vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); } return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); } diff --git a/vpx_dsp/arm/subpel_variance_media.c b/vpx_dsp/arm/subpel_variance_media.c index e7d8c85fb510028d6fde9330fd589d0841a95e48..ab53361579df363db0dfd5464aee8d860b06dc07 100644 --- a/vpx_dsp/arm/subpel_variance_media.c +++ b/vpx_dsp/arm/subpel_variance_media.c @@ -14,91 +14,66 @@ #include "vpx_ports/mem.h" #if HAVE_MEDIA -static const int16_t bilinear_filters_media[8][2] = { - { 128, 0 }, - { 112, 16 }, - { 96, 32 }, - { 80, 48 }, - { 64, 64 }, - { 48, 80 }, - { 32, 96 }, - { 16, 112 } -}; +static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 }, + { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, + { 32, 96 }, { 16, 112 } }; -extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr, - uint16_t *dst_ptr, - uint32_t src_pitch, - uint32_t height, - uint32_t width, - const int16_t *filter); +extern void vpx_filter_block2d_bil_first_pass_media( + const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch, + uint32_t height, uint32_t width, const int16_t *filter); -extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr, - uint8_t *dst_ptr, - int32_t src_pitch, - uint32_t height, - uint32_t width, - const int16_t *filter); +extern void vpx_filter_block2d_bil_second_pass_media( + const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch, + uint32_t height, uint32_t width, const int16_t *filter); - -unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t first_pass[10*8]; - uint8_t second_pass[8*8]; +unsigned int vpx_sub_pixel_variance8x8_media( + const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int 
yoffset, + const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { + uint16_t first_pass[10 * 8]; + uint8_t second_pass[8 * 8]; const int16_t *HFilter, *VFilter; HFilter = bilinear_filters_media[xoffset]; VFilter = bilinear_filters_media[yoffset]; vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, - src_pixels_per_line, - 9, 8, HFilter); - vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, - 8, 8, 8, VFilter); + src_pixels_per_line, 9, 8, HFilter); + vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8, + VFilter); - return vpx_variance8x8_media(second_pass, 8, dst_ptr, - dst_pixels_per_line, sse); + return vpx_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line, + sse); } -unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t first_pass[36*16]; - uint8_t second_pass[20*16]; +unsigned int vpx_sub_pixel_variance16x16_media( + const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, + const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { + uint16_t first_pass[36 * 16]; + uint8_t second_pass[20 * 16]; const int16_t *HFilter, *VFilter; unsigned int var; if (xoffset == 4 && yoffset == 0) { - var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - sse); + var = vpx_variance_halfpixvar16x16_h_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); } else if (xoffset == 0 && yoffset == 4) { - var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - sse); + var = vpx_variance_halfpixvar16x16_v_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); } else if (xoffset == 4 && yoffset == 4) { - var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - sse); + var = vpx_variance_halfpixvar16x16_hv_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); } else { HFilter = bilinear_filters_media[xoffset]; VFilter = bilinear_filters_media[yoffset]; - vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, - src_pixels_per_line, - 17, 16, HFilter); - vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, - 16, 16, 16, VFilter); + vpx_filter_block2d_bil_first_pass_media( + src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter); + vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, + 16, VFilter); - var = vpx_variance16x16_media(second_pass, 16, dst_ptr, - dst_pixels_per_line, sse); + var = vpx_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line, + sse); } return var; } diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 40e2cc89b35d3e657bb1d025bf548091672138fa..f044e11a1553651086eb88f0266713e62e05de3f 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -18,14 +18,8 @@ #include "vpx_dsp/variance.h" static const uint8_t bilinear_filters[8][2] = { - { 128, 0, }, - { 112, 16, }, - { 96, 32, }, - { 80, 48, }, - { 64, 64, }, - { 48, 80, }, - { 32, 96, }, - { 16, 112, }, + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, @@ -79,74 +73,61 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, 
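Each pair in the bilinear filter tables above sums to 128, one pair per eighth-pel offset; the two-pass helpers apply them horizontally and then vertically before the variance is measured. A minimal scalar sketch of a single tap, assuming the usual rounding shift by 7 that matches taps summing to 128 (illustrative name, not the library's API):

#include <stdint.h>

/* One two-tap bilinear step: blend a pixel with its neighbour using a
 * filter pair that sums to 128, then normalize with a rounding shift. */
static uint8_t bilinear_tap(uint8_t a, uint8_t b, const int16_t *filter) {
  /* e.g. xoffset == 3 selects { 80, 48 }: 5/8 of a plus 3/8 of b. */
  return (uint8_t)((a * filter[0] + b * filter[1] + 64) >> 7);
}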
} } -unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, - int src_stride, - int xoffset, - int yoffset, - const uint8_t *dst, - int dst_stride, +unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, unsigned int *sse) { DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, - 9, 8, + var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, bilinear_filters[xoffset]); - var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, - 8, bilinear_filters[yoffset]); + var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, + bilinear_filters[yoffset]); return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); } unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src, - int src_stride, - int xoffset, - int yoffset, - const uint8_t *dst, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, int dst_stride, unsigned int *sse) { DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, - 17, 16, + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, - 16, bilinear_filters[yoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, + bilinear_filters[yoffset]); return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); } unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src, - int src_stride, - int xoffset, - int yoffset, - const uint8_t *dst, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, int dst_stride, unsigned int *sse) { DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, - 33, 32, + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, - 32, bilinear_filters[yoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, + bilinear_filters[yoffset]); return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); } unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src, - int src_stride, - int xoffset, - int yoffset, - const uint8_t *dst, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, int dst_stride, unsigned int *sse) { DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, - 65, 64, + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, - 64, bilinear_filters[yoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, + bilinear_filters[yoffset]); return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); } diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c index 7b146095ea2f5aed80e05a3bd7f52e1d73c62405..ce81fb630f248f2f2053a15befb1691333b6d083 100644 --- a/vpx_dsp/arm/subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -13,10 +13,10 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -void vpx_subtract_block_neon(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src, ptrdiff_t src_stride, - const uint8_t *pred, ptrdiff_t pred_stride) { +void vpx_subtract_block_neon(int rows, int 
cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { int r, c; if (cols > 16) { @@ -24,38 +24,38 @@ void vpx_subtract_block_neon(int rows, int cols, for (c = 0; c < cols; c += 32) { const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), - vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), - vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), - vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), - vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); - vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + const uint16x8_t v_diff_lo_00 = + vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = + vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = + vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = + vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); } diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } else if (cols > 8) { for (r = 0; r < rows; ++r) { const uint8x16_t v_src = vld1q_u8(&src[0]); const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), - vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), - vget_high_u8(v_pred)); + const uint16x8_t v_diff_lo = + vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = + vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } else if (cols > 4) { for (r = 0; r < rows; ++r) { @@ -65,16 +65,15 @@ void vpx_subtract_block_neon(int rows, int cols, vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } else { for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) - diff[c] = src[c] - pred[c]; + for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } } diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index ede6e7bbb03b6915f6e19a1c383d86fd7477646c..f469afc4e4b8db4a93c6607a72678e315e3d3b2d 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -32,9 +32,9 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { } // w * h must be less than 2048 or local variable v_sum may overflow. 
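For orientation, a scalar sketch of what the NEON variance kernels below accumulate: a sum of differences and a sum of squared differences over a w x h block, from which the callers form variance = sse - sum^2 / (w * h); the >> 11 and >> 12 in the vpx_variance*_neon functions are that division for 2048- and 4096-pixel blocks, and the 16-bit lanes of v_sum are why the w * h cap in the comment above exists. Illustrative code, assuming nothing beyond the arithmetic visible in the diff:

#include <stdint.h>

/* Scalar reference for the sum/sse accumulation done by variance_neon_w8. */
static uint32_t variance_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride, int w, int h) {
  int64_t sum = 0, sse = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = a[j] - b[j];
      sum += d;       /* signed sum of differences */
      sse += d * d;   /* sum of squared differences */
    }
    a += a_stride;
    b += b_stride;
  }
  return (uint32_t)(sse - (sum * sum) / (w * h));
}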
-static void variance_neon_w8(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { int i, j; int16x8_t v_sum = vdupq_n_s16(0); int32x4_t v_sse_lo = vdupq_n_s32(0); @@ -47,12 +47,10 @@ static void variance_neon_w8(const uint8_t *a, int a_stride, const uint16x8_t v_diff = vsubl_u8(v_a, v_b); const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = vmlal_s16(v_sse_lo, - vget_low_s16(sv_diff), - vget_low_s16(sv_diff)); - v_sse_hi = vmlal_s16(v_sse_hi, - vget_high_s16(sv_diff), - vget_high_s16(sv_diff)); + v_sse_lo = + vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); + v_sse_hi = + vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); } a += a_stride; b += b_stride; @@ -62,15 +60,13 @@ static void variance_neon_w8(const uint8_t *a, int a_stride, *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); } -void vpx_get8x8var_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse, int *sum) { +void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); } -void vpx_get16x16var_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse, int *sum) { +void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); } @@ -104,9 +100,8 @@ unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, int sum1, sum2; uint32_t sse1, sse2; variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + (32 * a_stride), a_stride, - b + (32 * b_stride), b_stride, 32, 32, - &sse2, &sum2); + variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride, + 32, 32, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 @@ -118,9 +113,8 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, int sum1, sum2; uint32_t sse1, sse2; variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 @@ -133,286 +127,273 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, uint32_t sse1, sse2; variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, - b + (16 * 2 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, - b + (16 * 3 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w8(a + 
(16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 } -unsigned int vpx_variance16x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); +unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = 
vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - return vget_lane_u32(d0u32, 0); -} + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); -unsigned int vpx_variance8x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); + return vget_lane_u32(d0u32, 0); +} - d0s64 = vadd_s64(vget_low_s64(q0s64), 
vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); +unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } - return vget_lane_u32(d0u32, 0); -} + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); -unsigned int vpx_mse16x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64x1_t d0s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - q7s32 = vdupq_n_s32(0); - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // mse16x16_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q7s32 = vmlal_s16(q7s32, d22s16, d22s16); - q8s32 = vmlal_s16(q8s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q7s32 = vmlal_s16(q7s32, d26s16, d26s16); - q8s32 = vmlal_s16(q8s32, d27s16, d27s16); - - d28s16 = 
vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q10s32 = vaddq_s32(q7s32, q9s32); + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - q1s64 = vpaddlq_s32(q10s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(d0u32, 0); } -unsigned int vpx_get4x4sse_cs_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride) { - int16x4_t d22s16, d24s16, d26s16, d28s16; - int64x1_t d0s64; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - d0u8 = vld1_u8(src_ptr); +unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); src_ptr += source_stride; - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d1u8 = vld1_u8(src_ptr); + q1u8 = vld1q_u8(src_ptr); src_ptr += source_stride; - d5u8 = vld1_u8(ref_ptr); + q2u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d7u8 = vld1_u8(ref_ptr); + q3u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); - d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); - d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); - d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = 
vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } - q7s32 = vmull_s16(d22s16, d22s16); - q8s32 = vmull_s16(d24s16, d24s16); - q9s32 = vmull_s16(d26s16, d26s16); - q10s32 = vmull_s16(d28s16, d28s16); + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q10s32 = vaddq_s32(q7s32, q9s32); - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q9s32 = vaddq_s32(q7s32, q9s32); + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - q1s64 = vpaddlq_s32(q9s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); } diff --git a/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/vpx_dsp/arm/vpx_convolve8_avg_neon.c index 8632250138c18b7f7ce86cac0892a76619857026..69cb28400538d1f8767c7dc863d764206158c701 100644 --- a/vpx_dsp/arm/vpx_convolve8_avg_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_avg_neon.c @@ -16,16 +16,11 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -static INLINE int32x4_t MULTIPLY_BY_Q0( - int16x4_t dsrc0, - int16x4_t dsrc1, - int16x4_t dsrc2, - int16x4_t dsrc3, - int16x4_t dsrc4, - int16x4_t dsrc5, - int16x4_t dsrc6, - int16x4_t dsrc7, - int16x8_t q0s16) { +static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, + int16x4_t dsrc2, int16x4_t dsrc3, + int16x4_t dsrc4, int16x4_t dsrc5, + int16x4_t dsrc6, int16x4_t dsrc7, + int16x8_t q0s16) { int32x4_t qdst; int16x4_t d0s16, d1s16; @@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0( return qdst; } -void vpx_convolve8_avg_horiz_neon( - const uint8_t *src, - ptrdiff_t 
src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, - int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, - int h) { +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, int h) { int width; const uint8_t *s; uint8_t *d; @@ -76,7 +66,7 @@ void vpx_convolve8_avg_horiz_neon( q0s16 = vld1q_s16(filter_x); - src -= 3; // adjust for taps + src -= 3; // adjust for taps for (; h > 0; h -= 4) { // loop_horiz_v s = src; d24u8 = vld1_u8(s); @@ -90,8 +80,8 @@ void vpx_convolve8_avg_horiz_neon( q12u8 = vcombine_u8(d24u8, d25u8); q13u8 = vcombine_u8(d26u8, d27u8); - q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), - vreinterpretq_u16_u8(q13u8)); + q0x2u16 = + vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); @@ -116,10 +106,8 @@ void vpx_convolve8_avg_horiz_neon( q9u16 = vcombine_u16(d17u16, d19u16); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w; - width > 0; - width -= 4, src += 4, dst += 4) { // loop_horiz + d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 + for (width = w; width > 0; width -= 4, src += 4, dst += 4) { // loop_horiz s = src; d28u32 = vld1_dup_u32((const uint32_t *)s); s += src_stride; @@ -131,10 +119,10 @@ void vpx_convolve8_avg_horiz_neon( __builtin_prefetch(src + 64); - d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), - vreinterpret_u16_u32(d31u32)); - d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), - vreinterpret_u16_u32(d30u32)); + d0x2u16 = + vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); + d1x2u16 = + vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 vreinterpret_u8_u16(d1x2u16.val[0])); // d29 d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 @@ -144,8 +132,8 @@ void vpx_convolve8_avg_horiz_neon( q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), - vreinterpretq_u32_u8(q15u8)); + q0x2u32 = + vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); @@ -173,14 +161,14 @@ void vpx_convolve8_avg_horiz_neon( d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, - d18s16, d19s16, d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, - d19s16, d23s16, d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, d27s16, d25s16, q0s16); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, + d26s16, d27s16, q0s16); + 
q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, + d27s16, d25s16, q0s16); __builtin_prefetch(src + 64 + src_stride * 3); @@ -195,8 +183,7 @@ void vpx_convolve8_avg_horiz_neon( d2u8 = vqmovn_u16(q1u16); d3u8 = vqmovn_u16(q2u16); - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), - vreinterpret_u16_u8(d3u8)); + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), vreinterpret_u32_u16(d0x2u16.val[1])); d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), @@ -231,17 +218,12 @@ void vpx_convolve8_avg_horiz_neon( return; } -void vpx_convolve8_avg_vert_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, - int y_step_q4, - int w, - int h) { +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, int y_step_q4, int w, + int h) { int height; const uint8_t *s; uint8_t *d; @@ -277,8 +259,8 @@ void vpx_convolve8_avg_vert_neon( d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); s += src_stride; - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); @@ -319,20 +301,20 @@ void vpx_convolve8_avg_vert_neon( __builtin_prefetch(s); __builtin_prefetch(s + src_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, - d20s16, d21s16, d22s16, d24s16, q0s16); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, q0s16); __builtin_prefetch(s + src_stride * 2); __builtin_prefetch(s + src_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, - d21s16, d22s16, d24s16, d26s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, q0s16); __builtin_prefetch(d); __builtin_prefetch(d + dst_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, d26s16, d27s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, + d26s16, d27s16, q0s16); __builtin_prefetch(d + dst_stride * 2); __builtin_prefetch(d + dst_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, d27s16, d25s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, + d27s16, d25s16, q0s16); d2u16 = vqrshrun_n_s32(q1s32, 7); d3u16 = vqrshrun_n_s32(q2s32, 7); diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 9bd715e2c630b2f65adeeb18195f6eea9b2685d8..514525696b0dd99922a4e0e7ab67394de1644310 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -16,16 +16,11 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -static INLINE int32x4_t MULTIPLY_BY_Q0( - int16x4_t dsrc0, - int16x4_t dsrc1, - int16x4_t dsrc2, - int16x4_t dsrc3, - int16x4_t dsrc4, - int16x4_t dsrc5, - int16x4_t dsrc6, - int16x4_t dsrc7, - int16x8_t q0s16) { +static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, + int16x4_t dsrc2, int16x4_t dsrc3, + int16x4_t dsrc4, int16x4_t dsrc5, + int16x4_t dsrc6, int16x4_t dsrc7, + int16x8_t q0s16) { int32x4_t qdst; int16x4_t d0s16, 
d1s16; @@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0( return qdst; } -void vpx_convolve8_horiz_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, - int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, - int h) { +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, int h) { int width; const uint8_t *s, *psrc; uint8_t *d, *pdst; @@ -77,9 +67,8 @@ void vpx_convolve8_horiz_neon( q0s16 = vld1q_s16(filter_x); src -= 3; // adjust for taps - for (; h > 0; h -= 4, - src += src_stride * 4, - dst += dst_stride * 4) { // loop_horiz_v + for (; h > 0; h -= 4, src += src_stride * 4, + dst += dst_stride * 4) { // loop_horiz_v s = src; d24u8 = vld1_u8(s); s += src_stride; @@ -92,8 +81,8 @@ void vpx_convolve8_horiz_neon( q12u8 = vcombine_u8(d24u8, d25u8); q13u8 = vcombine_u8(d26u8, d27u8); - q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), - vreinterpretq_u16_u8(q13u8)); + q0x2u16 = + vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); @@ -105,8 +94,8 @@ void vpx_convolve8_horiz_neon( __builtin_prefetch(src + src_stride * 5); __builtin_prefetch(src + src_stride * 6); - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); + q8u16 = vmovl_u8(d0x2u8.val[0]); + q9u16 = vmovl_u8(d0x2u8.val[1]); q10u16 = vmovl_u8(d1x2u8.val[0]); q11u16 = vmovl_u8(d1x2u8.val[1]); @@ -119,8 +108,7 @@ void vpx_convolve8_horiz_neon( d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w, psrc = src + 7, pdst = dst; - width > 0; + for (width = w, psrc = src + 7, pdst = dst; width > 0; width -= 4, psrc += 4, pdst += 4) { // loop_horiz s = psrc; d28u32 = vld1_dup_u32((const uint32_t *)s); @@ -133,10 +121,10 @@ void vpx_convolve8_horiz_neon( __builtin_prefetch(psrc + 64); - d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), - vreinterpret_u16_u32(d31u32)); - d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), - vreinterpret_u16_u32(d30u32)); + d0x2u16 = + vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); + d1x2u16 = + vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 vreinterpret_u8_u16(d1x2u16.val[0])); // d29 d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 @@ -146,8 +134,8 @@ void vpx_convolve8_horiz_neon( q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), - vreinterpretq_u32_u8(q15u8)); + q0x2u32 = + vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); @@ -166,14 +154,14 @@ void vpx_convolve8_horiz_neon( d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, - d18s16, d19s16, d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, - d19s16, d23s16, d24s16, d26s16, q0s16); - q14s32 = 
MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, d27s16, d25s16, q0s16); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, + d26s16, d27s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, + d27s16, d25s16, q0s16); __builtin_prefetch(psrc + 60 + src_stride * 3); @@ -188,8 +176,7 @@ void vpx_convolve8_horiz_neon( d2u8 = vqmovn_u16(q1u16); d3u8 = vqmovn_u16(q2u16); - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), - vreinterpret_u16_u8(d3u8)); + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), vreinterpret_u32_u16(d0x2u16.val[1])); d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), @@ -217,17 +204,12 @@ void vpx_convolve8_horiz_neon( return; } -void vpx_convolve8_vert_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, - int y_step_q4, - int w, - int h) { +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, int y_step_q4, int w, + int h) { int height; const uint8_t *s; uint8_t *d; @@ -261,8 +243,8 @@ void vpx_convolve8_vert_neon( d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); s += src_stride; - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); @@ -294,20 +276,20 @@ void vpx_convolve8_vert_neon( __builtin_prefetch(d); __builtin_prefetch(d + dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, - d20s16, d21s16, d22s16, d24s16, q0s16); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, q0s16); __builtin_prefetch(d + dst_stride * 2); __builtin_prefetch(d + dst_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, - d21s16, d22s16, d24s16, d26s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, q0s16); __builtin_prefetch(s); __builtin_prefetch(s + src_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, d26s16, d27s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, + d26s16, d27s16, q0s16); __builtin_prefetch(s + src_stride * 2); __builtin_prefetch(s + src_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, d27s16, d25s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, + d27s16, d25s16, q0s16); d2u16 = vqrshrun_n_s32(q1s32, 7); d3u16 = vqrshrun_n_s32(q2s32, 7); diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon.c b/vpx_dsp/arm/vpx_convolve_avg_neon.c index dc58a332f81d147acc3e9b60f19ac8de32347f9b..abc2511ea291f34c26710c9eae7d3a24138c2940 100644 --- a/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -13,34 +13,32 @@ #include "./vpx_dsp_rtcd.h" 
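The convolve8 kernels above reduce, per output pixel, to an 8-tap dot product (MULTIPLY_BY_Q0) followed by the rounding narrow vqrshrun_n_s32(..., 7). A scalar sketch of that step, with the clamping written out explicitly (illustrative, not libvpx's reference code):

#include <stdint.h>

/* One 8-tap filter step: multiply-accumulate against the filter, apply a
 * rounding shift by 7 (the taps sum to 128) and saturate to an 8-bit pixel. */
static uint8_t convolve8_pixel(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  sum = (sum + 64) >> 7; /* rounding shift, as in vqrshrun_n_s32(q, 7) */
  if (sum < 0) sum = 0;  /* unsigned saturation */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}

The avg variants then fold the result into the existing destination with a rounded halving add, (x + y + 1) >> 1, which is what the vrhadd_u8 calls below implement.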
#include "vpx/vpx_integer.h" -void vpx_convolve_avg_neon( - const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, - int filter_x_stride, - const int16_t *filter_y, - int filter_y_stride, - int w, - int h) { +void vpx_convolve_avg_neon(const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, int w, + int h) { uint8_t *d; uint8x8_t d0u8, d1u8, d2u8, d3u8; uint32x2_t d0u32, d2u32; uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; d = dst; if (w > 32) { // avg64 for (; h > 0; h -= 1) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + q2u8 = vld1q_u8(src + 32); + q3u8 = vld1q_u8(src + 48); src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); + q8u8 = vld1q_u8(d); + q9u8 = vld1q_u8(d + 16); q10u8 = vld1q_u8(d + 32); q11u8 = vld1q_u8(d + 48); d += dst_stride; @@ -133,8 +131,7 @@ void vpx_convolve_avg_neon( d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); d += dst_stride; - d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), - vreinterpret_u8_u32(d2u32)); + d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32)); d0u32 = vreinterpret_u32_u8(d0u8); vst1_lane_u32((uint32_t *)dst, d0u32, 0); diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c index d8fb97a861907cc834765d3e259e3d570a34770a..fec189e0e4267ed9fe3f257b59c61b3c7ac2c85d 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -13,21 +13,19 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_convolve_copy_neon( - const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, - int filter_x_stride, - const int16_t *filter_y, - int filter_y_stride, - int w, - int h) { +void vpx_convolve_copy_neon(const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, int w, + int h) { uint8x8_t d0u8, d2u8; uint8x16_t q0u8, q1u8, q2u8, q3u8; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; if (w > 32) { // copy64 for (; h > 0; h--) { diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c index 1506ce6203de21ade9449453b47c94237cfa608b..c2d5895b718bd02e8ff91cc45ff62efc7ad46f17 100644 --- a/vpx_dsp/arm/vpx_convolve_neon.c +++ b/vpx_dsp/arm/vpx_convolve_neon.c @@ -14,10 +14,9 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_ports/mem.h" -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { /* Given our constraints: w <= 64, h <= 64, taps 
== 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). @@ -35,23 +34,20 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, * the temp buffer which has lots of extra room and is subsequently discarded * this is safe if somewhat less than ideal. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height); + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, + intermediate_height); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + 64 * 3, 64, - dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); int intermediate_height = h + 7; @@ -61,12 +57,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height); - vpx_convolve8_avg_vert_neon(temp + 64 * 3, - 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, + intermediate_height); + vpx_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index cf7fd36665e04ddd109ddc9a06e77b32dc0b642b..b0c5e9831d85ae6d11b3e233ba6ccfb7f84b5414 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -16,7 +16,8 @@ unsigned int vpx_avg_8x8_c(const uint8_t *src, int stride) { int i, j; int sum = 0; for (i = 0; i < 8; ++i, src += stride) - for (j = 0; j < 8; sum += src[j], ++j) {} + for (j = 0; j < 8; sum += src[j], ++j) { + } return ROUND_POWER_OF_TWO(sum, 6); } @@ -25,7 +26,8 @@ unsigned int vpx_avg_4x4_c(const uint8_t *src, int stride) { int i, j; int sum = 0; for (i = 0; i < 4; ++i, src += stride) - for (j = 0; j < 4; sum += src[j], ++j) {} + for (j = 0; j < 4; sum += src[j], ++j) { + } return ROUND_POWER_OF_TWO(sum, 4); } @@ -80,8 +82,8 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, for (idx = 0; idx < 8; ++idx) { hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit // dynamic range [-2040, 2040] - coeff += 8; // coeff: 15 bit - // dynamic range [-16320, 16320] + coeff += 8; // coeff: 15 bit + // dynamic range [-16320, 16320] ++tmp_buf; } } @@ -92,8 +94,8 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] - const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride - + (idx & 0x01) * 8; + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); } @@ -109,8 +111,8 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t b2 = (a2 + a3) 
>> 1; // [-16320, 16320] int16_t b3 = (a2 - a3) >> 1; - coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] - coeff[64] = b1 + b3; + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; coeff[128] = b0 - b2; coeff[192] = b1 - b3; @@ -123,8 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int vpx_satd_c(const int16_t *coeff, int length) { int i; int satd = 0; - for (i = 0; i < length; ++i) - satd += abs(coeff[i]); + for (i = 0; i < length; ++i) satd += abs(coeff[i]); // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] return satd; @@ -140,8 +141,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, int i; hbuf[idx] = 0; // hbuf[idx]: 14 bit, dynamic range [0, 16320]. - for (i = 0; i < height; ++i) - hbuf[idx] += ref[i * ref_stride]; + for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; // hbuf[idx]: 9 bit, dynamic range [0, 510]. hbuf[idx] /= norm_factor; ++ref; @@ -153,16 +153,14 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) { int idx; int16_t sum = 0; // sum: 14 bit, dynamic range [0, 16320] - for (idx = 0; idx < width; ++idx) - sum += ref[idx]; + for (idx = 0; idx < width; ++idx) sum += ref[idx]; return sum; } // ref: [0 - 510] // src: [0 - 510] // bwl: {2, 3, 4} -int vpx_vector_var_c(const int16_t *ref, const int16_t *src, - const int bwl) { +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) { int i; int width = 4 << bwl; int sse = 0, mean = 0, var; @@ -178,15 +176,14 @@ int vpx_vector_var_c(const int16_t *ref, const int16_t *src, return var; } -void vpx_minmax_8x8_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int *min, int *max) { +void vpx_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int *min, int *max) { int i, j; *min = 255; *max = 0; for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) { for (j = 0; j < 8; ++j) { - int diff = abs(src[j]-ref[j]); + int diff = abs(src[j] - ref[j]); *min = diff < *min ? diff : *min; *max = diff > *max ? 
diff : *max; } @@ -197,9 +194,10 @@ void vpx_minmax_8x8_c(const uint8_t *src, int src_stride, unsigned int vpx_highbd_avg_8x8_c(const uint8_t *src, int stride) { int i, j; int sum = 0; - const uint16_t* s = CONVERT_TO_SHORTPTR(src); + const uint16_t *s = CONVERT_TO_SHORTPTR(src); for (i = 0; i < 8; ++i, s += stride) - for (j = 0; j < 8; sum += s[j], ++j) {} + for (j = 0; j < 8; sum += s[j], ++j) { + } return ROUND_POWER_OF_TWO(sum, 6); } @@ -207,9 +205,10 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *src, int stride) { unsigned int vpx_highbd_avg_4x4_c(const uint8_t *src, int stride) { int i, j; int sum = 0; - const uint16_t* s = CONVERT_TO_SHORTPTR(src); - for (i = 0; i < 4; ++i, s+=stride) - for (j = 0; j < 4; sum += s[j], ++j) {} + const uint16_t *s = CONVERT_TO_SHORTPTR(src); + for (i = 0; i < 4; ++i, s += stride) + for (j = 0; j < 4; sum += s[j], ++j) { + } return ROUND_POWER_OF_TWO(sum, 4); } @@ -217,18 +216,16 @@ unsigned int vpx_highbd_avg_4x4_c(const uint8_t *src, int stride) { void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max) { int i, j; - const uint16_t* s = CONVERT_TO_SHORTPTR(s8); - const uint16_t* d = CONVERT_TO_SHORTPTR(d8); + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const uint16_t *d = CONVERT_TO_SHORTPTR(d8); *min = 255; *max = 0; for (i = 0; i < 8; ++i, s += p, d += dp) { for (j = 0; j < 8; ++j) { - int diff = abs(s[j]-d[j]); + int diff = abs(s[j] - d[j]); *min = diff < *min ? diff : *min; *max = diff > *max ? diff : *max; } } } #endif // CONFIG_VP9_HIGHBITDEPTH - - diff --git a/vpx_dsp/bitreader.c b/vpx_dsp/bitreader.c index 8140e78e70e86ecd110fa39c9d2cbd3a45ac5a7b..90cbbba53f47ae8638f61ca44c9d39eaf9cd893d 100644 --- a/vpx_dsp/bitreader.c +++ b/vpx_dsp/bitreader.c @@ -18,11 +18,8 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_util/endian_inl.h" -int vpx_reader_init(vpx_reader *r, - const uint8_t *buffer, - size_t size, - vpx_decrypt_cb decrypt_cb, - void *decrypt_state) { +int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size, + vpx_decrypt_cb decrypt_cb, void *decrypt_state) { if (size && !buffer) { return 1; } else { @@ -55,19 +52,19 @@ void vpx_reader_fill(vpx_reader *r) { buffer_start = r->clear_buffer; } if (bits_left > BD_VALUE_SIZE) { - const int bits = (shift & 0xfffffff8) + CHAR_BIT; - BD_VALUE nv; - BD_VALUE big_endian_values; - memcpy(&big_endian_values, buffer, sizeof(BD_VALUE)); + const int bits = (shift & 0xfffffff8) + CHAR_BIT; + BD_VALUE nv; + BD_VALUE big_endian_values; + memcpy(&big_endian_values, buffer, sizeof(BD_VALUE)); #if SIZE_MAX == 0xffffffffffffffffULL - big_endian_values = HToBE64(big_endian_values); + big_endian_values = HToBE64(big_endian_values); #else - big_endian_values = HToBE32(big_endian_values); + big_endian_values = HToBE32(big_endian_values); #endif - nv = big_endian_values >> (BD_VALUE_SIZE - bits); - count += bits; - buffer += (bits >> 3); - value = r->value | (nv << (shift & 0x7)); + nv = big_endian_values >> (BD_VALUE_SIZE - bits); + count += bits; + buffer += (bits >> 3); + value = r->value | (nv << (shift & 0x7)); } else { const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left); int loop_end = 0; diff --git a/vpx_dsp/bitreader.h b/vpx_dsp/bitreader.h index 9a441b41077e6b4100b746168dddef7bcd74cbf6..6ee2a58632c5e4c5a2c7574f9cc430be74aee37a 100644 --- a/vpx_dsp/bitreader.h +++ b/vpx_dsp/bitreader.h @@ -45,11 +45,8 @@ typedef struct { uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; } vpx_reader; -int vpx_reader_init(vpx_reader *r, - const uint8_t 
*buffer, - size_t size, - vpx_decrypt_cb decrypt_cb, - void *decrypt_state); +int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size, + vpx_decrypt_cb decrypt_cb, void *decrypt_state); void vpx_reader_fill(vpx_reader *r); @@ -81,8 +78,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { unsigned int range; unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; - if (r->count < 0) - vpx_reader_fill(r); + if (r->count < 0) vpx_reader_fill(r); value = r->value; count = r->count; @@ -117,8 +113,7 @@ static INLINE int vpx_read_bit(vpx_reader *r) { static INLINE int vpx_read_literal(vpx_reader *r, int bits) { int literal = 0, bit; - for (bit = bits - 1; bit >= 0; bit--) - literal |= vpx_read_bit(r) << bit; + for (bit = bits - 1; bit >= 0; bit--) literal |= vpx_read_bit(r) << bit; return literal; } @@ -127,8 +122,7 @@ static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree, const vpx_prob *probs) { vpx_tree_index i = 0; - while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) - continue; + while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) continue; return -i; } diff --git a/vpx_dsp/bitreader_buffer.c b/vpx_dsp/bitreader_buffer.c index 595b9bb1233b75fb0598f6d11dac7965358fef2b..bf88119a948c149a4babc9211388db1a3160b804 100644 --- a/vpx_dsp/bitreader_buffer.c +++ b/vpx_dsp/bitreader_buffer.c @@ -30,20 +30,17 @@ int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) { int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) { int value = 0, bit; - for (bit = bits - 1; bit >= 0; bit--) - value |= vpx_rb_read_bit(rb) << bit; + for (bit = bits - 1; bit >= 0; bit--) value |= vpx_rb_read_bit(rb) << bit; return value; } -int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, - int bits) { +int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { const int value = vpx_rb_read_literal(rb, bits); return vpx_rb_read_bit(rb) ? 
-value : value; } -int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, - int bits) { +int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { const int nbits = sizeof(unsigned) * 8 - bits - 1; const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits; - return ((int) value) >> nbits; + return ((int)value) >> nbits; } diff --git a/vpx_dsp/bitwriter.c b/vpx_dsp/bitwriter.c index 5b232e346e22a7214add60f5a9d21d760b62585b..81e28b309f573e2cabb1b6c29f9324655eacca86 100644 --- a/vpx_dsp/bitwriter.c +++ b/vpx_dsp/bitwriter.c @@ -14,21 +14,18 @@ void vpx_start_encode(vpx_writer *br, uint8_t *source) { br->lowvalue = 0; - br->range = 255; - br->count = -24; - br->buffer = source; - br->pos = 0; + br->range = 255; + br->count = -24; + br->buffer = source; + br->pos = 0; vpx_write_bit(br, 0); } void vpx_stop_encode(vpx_writer *br) { int i; - for (i = 0; i < 32; i++) - vpx_write_bit(br, 0); + for (i = 0; i < 32; i++) vpx_write_bit(br, 0); // Ensure there's no ambigous collision with any index marker bytes - if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) - br->buffer[br->pos++] = 0; + if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0; } - diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h index d904997af309ffcd36c0295c2d21c551a7cc35cb..41040cf93549829d36297293d1a655a6b52ac09d 100644 --- a/vpx_dsp/bitwriter.h +++ b/vpx_dsp/bitwriter.h @@ -85,8 +85,7 @@ static INLINE void vpx_write_bit(vpx_writer *w, int bit) { static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) { int bit; - for (bit = bits - 1; bit >= 0; bit--) - vpx_write_bit(w, 1 & (data >> bit)); + for (bit = bits - 1; bit >= 0; bit--) vpx_write_bit(w, 1 & (data >> bit)); } #define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8) diff --git a/vpx_dsp/bitwriter_buffer.c b/vpx_dsp/bitwriter_buffer.c index 8633372da3478af419c749f74473e0ebddaa2595..0638622911aeee4fea21e65c2d944c86c3da4098 100644 --- a/vpx_dsp/bitwriter_buffer.c +++ b/vpx_dsp/bitwriter_buffer.c @@ -22,7 +22,7 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) { const int off = (int)wb->bit_offset; const int p = off / CHAR_BIT; const int q = CHAR_BIT - 1 - off % CHAR_BIT; - if (q == CHAR_BIT -1) { + if (q == CHAR_BIT - 1) { wb->bit_buffer[p] = bit << q; } else { wb->bit_buffer[p] &= ~(1 << q); @@ -33,11 +33,10 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) { void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { int bit; - for (bit = bits - 1; bit >= 0; bit--) - vpx_wb_write_bit(wb, (data >> bit) & 1); + for (bit = bits - 1; bit >= 0; bit--) vpx_wb_write_bit(wb, (data >> bit) & 1); } -void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, - int data, int bits) { +void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data, + int bits) { vpx_wb_write_literal(wb, data, bits + 1); } diff --git a/vpx_dsp/blend.h b/vpx_dsp/blend.h index 109183acc41bc49bdffbc03aed1da8eb0881be3e..2ceb4c78f854eb47a4a20c68dbcbc122ffb93c5a 100644 --- a/vpx_dsp/blend.h +++ b/vpx_dsp/blend.h @@ -18,23 +18,23 @@ // Alpha blending with alpha values from the range [0, 64], where 64 // means use the first input and 0 means use the second input. 
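// Illustrative sketch, not part of the patch: VPX_BLEND_A64 below is a
// weighted average with a 6-bit alpha, ROUND_POWER_OF_TWO(a*v0 + (64-a)*v1, 6),
// where ROUND_POWER_OF_TWO(x, n) is a rounded right shift by n. A standalone
// restatement of the arithmetic (the EX_* names are hypothetical):
#define EX_ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
#define EX_BLEND_A64(a, v0, v1) \
  EX_ROUND_POWER_OF_TWO((a) * (v0) + (64 - (a)) * (v1), 6)
// Worked values: EX_BLEND_A64(64, 200, 100) == 200 (alpha 64 keeps the first
// input), EX_BLEND_A64(0, 200, 100) == 100 (alpha 0 keeps the second), and
// EX_BLEND_A64(16, 200, 100) == (16 * 200 + 48 * 100 + 32) >> 6 == 125.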
-#define VPX_BLEND_A64_ROUND_BITS 6 -#define VPX_BLEND_A64_MAX_ALPHA (1 << VPX_BLEND_A64_ROUND_BITS) // 64 +#define VPX_BLEND_A64_ROUND_BITS 6 +#define VPX_BLEND_A64_MAX_ALPHA (1 << VPX_BLEND_A64_ROUND_BITS) // 64 -#define VPX_BLEND_A64(a, v0, v1) \ - ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ +#define VPX_BLEND_A64(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ VPX_BLEND_A64_ROUND_BITS) // Alpha blending with alpha values from the range [0, 256], where 256 // means use the first input and 0 means use the second input. #define VPX_BLEND_A256_ROUND_BITS 8 -#define VPX_BLEND_A256_MAX_ALPHA (1 << VPX_BLEND_A256_ROUND_BITS) // 256 +#define VPX_BLEND_A256_MAX_ALPHA (1 << VPX_BLEND_A256_ROUND_BITS) // 256 -#define VPX_BLEND_A256(a, v0, v1) \ - ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ +#define VPX_BLEND_A256(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ VPX_BLEND_A256_ROUND_BITS) // Blending by averaging. -#define VPX_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) +#define VPX_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) #endif // VPX_DSP_BLEND_H_ diff --git a/vpx_dsp/blend_a64_hmask.c b/vpx_dsp/blend_a64_hmask.c index 90f3415fffd4c6b81847a505a1e623d882ced1f6..46d73ffd289730d1a5d2b9cfe5d2e0aca5cbe2f8 100644 --- a/vpx_dsp/blend_a64_hmask.c +++ b/vpx_dsp/blend_a64_hmask.c @@ -17,11 +17,10 @@ #include "./vpx_dsp_rtcd.h" -void vpx_blend_a64_hmask_c( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +void vpx_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -34,19 +33,17 @@ void vpx_blend_a64_hmask_c( for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j], - src0[i * src0_stride + j], - src1[i * src1_stride + j]); + dst[i * dst_stride + j] = VPX_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_blend_a64_hmask_c( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { +void vpx_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); @@ -64,9 +61,8 @@ void vpx_highbd_blend_a64_hmask_c( for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j], - src0[i * src0_stride + j], - src1[i * src1_stride + j]); + dst[i * dst_stride + j] = VPX_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } diff --git a/vpx_dsp/blend_a64_mask.c b/vpx_dsp/blend_a64_mask.c index 1649798e404340e5cc4cd2dcb0b311904e991fb0..eee544c1db5db6478541ad83d09eb8b66c0d1c94 100644 --- a/vpx_dsp/blend_a64_mask.c +++ b/vpx_dsp/blend_a64_mask.c @@ -24,8 +24,8 @@ void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t 
src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int subh, int subw) { + const uint8_t *mask, uint32_t mask_stride, int h, + int w, int subh, int subw) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -40,22 +40,20 @@ void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = mask[i * mask_stride + j]; - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else if (subw == 1 && subh == 1) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int m = - ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], - 2); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ -64,8 +62,7 @@ void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, for (j = 0; j < w; ++j) { const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)], mask[i * mask_stride + (2 * j + 1)]); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ -74,8 +71,7 @@ void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, for (j = 0; j < w; ++j) { const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j], mask[(2 * i + 1) * mask_stride + j]); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ -107,22 +103,20 @@ void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = mask[i * mask_stride + j]; - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else if (subw == 1 && subh == 1) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int m = - ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], - 2); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ -131,8 +125,7 @@ void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, for (j = 0; j < w; ++j) { const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)], mask[i * mask_stride + (2 * j + 1)]); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ 
-141,8 +134,7 @@ void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, for (j = 0; j < w; ++j) { const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j], mask[(2 * i + 1) * mask_stride + j]); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } diff --git a/vpx_dsp/blend_a64_vmask.c b/vpx_dsp/blend_a64_vmask.c index 5d48a8336cbb581b5436611e954558d80749433d..4a2ced75182483d666509aa63024ed13ed89901d 100644 --- a/vpx_dsp/blend_a64_vmask.c +++ b/vpx_dsp/blend_a64_vmask.c @@ -17,11 +17,10 @@ #include "./vpx_dsp_rtcd.h" -void vpx_blend_a64_vmask_c( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +void vpx_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -35,19 +34,17 @@ void vpx_blend_a64_vmask_c( for (i = 0; i < h; ++i) { const int m = mask[i]; for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_blend_a64_vmask_c( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { +void vpx_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); @@ -66,8 +63,7 @@ void vpx_highbd_blend_a64_vmask_c( for (i = 0; i < h; ++i) { const int m = mask[i]; for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c index aba99d7a665d8be4059843f4741ac104c5b9bad5..589b124e26a44f90f370b7b35964217ecd49bfff 100644 --- a/vpx_dsp/deblock.c +++ b/vpx_dsp/deblock.c @@ -10,26 +10,32 @@ #include <stdlib.h> #include "vpx/vpx_integer.h" -const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, - 14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, - 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13, - 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, - 8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, - 4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, - 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0, - 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, - 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, - 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, - 8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, - 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 
11, 8, 13, 1, 13, - 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, - 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, - 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, - 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13, - 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, - 13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, }; +const int16_t vpx_rv[] = { + 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14, + 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, + 3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, + 2, 9, 7, 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, + 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, 4, 14, 4, 10, 0, + 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, 0, 10, 0, 5, + 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, 4, 7, + 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, + 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, + 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, + 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, 10, 5, 8, 0, 11, 6, + 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, 8, 9, + 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, + 7, 2, 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, + 0, 11, 8, 13, 1, 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, + 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, + 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, + 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, + 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, + 3, 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, + 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, 0, 9, 5, 5, 11, 10, 13, + 9, 10, 13, +}; void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, unsigned char *dst_ptr, @@ -55,8 +61,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, v = p_src[col]; - if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) - && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) { + if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) && + (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) { unsigned char k1, k2, k3; k1 = (p_above2 + p_above1 + 1) >> 1; k2 = (p_below2 + p_below1 + 1) >> 1; @@ -77,10 +83,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, for (col = 0; col < cols; col++) { v = p_src[col]; - if ((abs(v - p_src[col - 2]) < f[col]) - && (abs(v - p_src[col - 1]) < f[col]) - && (abs(v - p_src[col + 1]) < f[col]) - && (abs(v - p_src[col + 2]) < f[col])) { + if ((abs(v - p_src[col - 2]) < f[col]) && + (abs(v - p_src[col - 1]) < f[col]) && + (abs(v - p_src[col + 1]) < f[col]) && + (abs(v - p_src[col + 2]) < f[col])) { unsigned char k1, k2, k3; k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; @@ -90,8 +96,7 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, d[col & 3] = v; - if (col >= 2) - p_dst[col - 2] = d[(col - 2) & 3]; + if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3]; } /* handle the last two pixels */ @@ -115,14 +120,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, 
int rows, int sumsq = 0; int sum = 0; - for (i = -8; i < 0; i++) - s[i] = s[0]; + for (i = -8; i < 0; i++) s[i] = s[0]; /* 17 avoids valgrind warning - we buffer values in c in d * and only write them when we've read 8 ahead... */ - for (i = 0; i < 17; i++) - s[i + cols] = s[cols - 1]; + for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1]; for (i = -8; i <= 6; i++) { sumsq += s[i] * s[i]; @@ -162,14 +165,12 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, unsigned char d[16]; const int16_t *rv2 = rv3 + ((c * 17) & 127); - for (i = -8; i < 0; i++) - s[i * pitch] = s[0]; + for (i = -8; i < 0; i++) s[i * pitch] = s[0]; /* 17 avoids valgrind warning - we buffer values in c in d * and only write them when we've read 8 ahead... */ - for (i = 0; i < 17; i++) - s[(i + rows) * pitch] = s[(rows - 1) * pitch]; + for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch]; for (i = -8; i <= 6; i++) { sumsq += s[i * pitch] * s[i * pitch]; @@ -184,10 +185,8 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, if (sumsq * 15 - sum * sum < flimit) { d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; } - if (r >= 8) - s[-8 * pitch] = d[(r - 8) & 15]; + if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15]; s += pitch; } } } - diff --git a/vpx_dsp/fastssim.c b/vpx_dsp/fastssim.c index 7d90891714373c9fe22d41dfae2243b8bc72705a..4d5eb5a6ff12594a9c84fceb5ffb385bbe4a6409 100644 --- a/vpx_dsp/fastssim.c +++ b/vpx_dsp/fastssim.c @@ -55,12 +55,12 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { int l; lw = (_w + 1) >> 1; lh = (_h + 1) >> 1; - data_size = _nlevels * sizeof(fs_level) - + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); + data_size = + _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); for (l = 0; l < _nlevels; l++) { size_t im_size; size_t level_size; - im_size = lw * (size_t) lh; + im_size = lw * (size_t)lh; level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); level_size += sizeof(*_ctx->level[l].ssim) - 1; level_size /= sizeof(*_ctx->level[l].ssim); @@ -70,8 +70,8 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { lw = (lw + 1) >> 1; lh = (lh + 1) >> 1; } - data = (unsigned char *) malloc(data_size); - _ctx->level = (fs_level *) data; + data = (unsigned char *)malloc(data_size); + _ctx->level = (fs_level *)data; _ctx->nlevels = _nlevels; data += _nlevels * sizeof(*_ctx->level); lw = (_w + 1) >> 1; @@ -81,7 +81,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { size_t level_size; _ctx->level[l].w = lw; _ctx->level[l].h = lh; - im_size = lw * (size_t) lh; + im_size = lw * (size_t)lh; level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); level_size += sizeof(*_ctx->level[l].ssim) - 1; level_size /= sizeof(*_ctx->level[l].ssim); @@ -89,17 +89,15 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { _ctx->level[l].im1 = (uint32_t *)data; _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; data += level_size; - _ctx->level[l].ssim = (double *) data; + _ctx->level[l].ssim = (double *)data; data += im_size * sizeof(*_ctx->level[l].ssim); lw = (lw + 1) >> 1; lh = (lh + 1) >> 1; } - _ctx->col_buf = (unsigned *) data; + _ctx->col_buf = (unsigned *)data; } -static void fs_ctx_clear(fs_ctx *_ctx) { - free(_ctx->level); -} +static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } static void fs_downsample_level(fs_ctx *_ctx, int _l) { const uint32_t *src1; @@ -130,18 +128,18 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) { int i1; i0 
= 2 * i; i1 = FS_MINI(i0 + 1, w2); - dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] - + src1[j1offs + i0] + src1[j1offs + i1]; - dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] - + src2[j1offs + i0] + src2[j1offs + i1]; + dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]; + dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]; } } } static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, int _s1ystride, const uint8_t *_src2, - int _s2ystride, int _w, int _h, - uint32_t bd, uint32_t shift) { + int _s2ystride, int _w, int _h, uint32_t bd, + uint32_t shift) { uint32_t *dst1; uint32_t *dst2; int w; @@ -163,23 +161,23 @@ static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, i0 = 2 * i; i1 = FS_MINI(i0 + 1, _w); if (bd == 8 && shift == 0) { - dst1[j * w + i] = _src1[j0 * _s1ystride + i0] - + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0] - + _src1[j1 * _s1ystride + i1]; - dst2[j * w + i] = _src2[j0 * _s2ystride + i0] - + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0] - + _src2[j1 * _s2ystride + i1]; + dst1[j * w + i] = + _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; + dst2[j * w + i] = + _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + + _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; } else { - uint16_t * src1s = CONVERT_TO_SHORTPTR(_src1); - uint16_t * src2s = CONVERT_TO_SHORTPTR(_src2); - dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) - + (src1s[j0 * _s1ystride + i1] >> shift) - + (src1s[j1 * _s1ystride + i0] >> shift) - + (src1s[j1 * _s1ystride + i1] >> shift); - dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) - + (src2s[j0 * _s2ystride + i1] >> shift) - + (src2s[j1 * _s2ystride + i0] >> shift) - + (src2s[j1 * _s2ystride + i1] >> shift); + uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); + uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2); + dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + + (src1s[j0 * _s1ystride + i1] >> shift) + + (src1s[j1 * _s1ystride + i0] >> shift) + + (src1s[j1 * _s1ystride + i1] >> shift); + dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + + (src2s[j0 * _s2ystride + i1] >> shift) + + (src2s[j1 * _s2ystride + i0] >> shift) + + (src2s[j1 * _s2ystride + i1] >> shift); } } } @@ -200,10 +198,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { int j; double ssim_c1 = SSIM_C1; #if CONFIG_VP9_HIGHBITDEPTH - if (bit_depth == 10) - ssim_c1 = SSIM_C1_10; - if (bit_depth == 12) - ssim_c1 = SSIM_C1_12; + if (bit_depth == 10) ssim_c1 = SSIM_C1_10; + if (bit_depth == 12) ssim_c1 = SSIM_C1_12; #else assert(bit_depth == 8); #endif @@ -213,19 +209,15 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { col_sums_y = col_sums_x + w; im1 = _ctx->level[_l].im1; im2 = _ctx->level[_l].im2; - for (i = 0; i < w; i++) - col_sums_x[i] = 5 * im1[i]; - for (i = 0; i < w; i++) - col_sums_y[i] = 5 * im2[i]; + for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; + for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; for (j = 1; j < 4; j++) { j1offs = FS_MINI(j, h - 1) * w; - for (i = 0; i < w; i++) - col_sums_x[i] += im1[j1offs + i]; - for (i = 0; i < w; i++) - col_sums_y[i] += im2[j1offs + i]; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; } ssim = _ctx->level[_l].ssim; - c1 = (double) (ssim_c1 * 4096 * 
(1 << 4 * _l)); + c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); for (j = 0; j < h; j++) { unsigned mux; unsigned muy; @@ -239,8 +231,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { muy += col_sums_y[i1]; } for (i = 0; i < w; i++) { - ssim[j * w + i] *= (2 * mux * (double) muy + c1) - / (mux * (double) mux + muy * (double) muy + c1); + ssim[j * w + i] *= (2 * mux * (double)muy + c1) / + (mux * (double)mux + muy * (double)muy + c1); if (i + 1 < w) { i0 = FS_MAXI(0, i - 4); i1 = FS_MINI(i + 4, w - 1); @@ -250,78 +242,68 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { } if (j + 1 < h) { j0offs = FS_MAXI(0, j - 4) * w; - for (i = 0; i < w; i++) - col_sums_x[i] -= im1[j0offs + i]; - for (i = 0; i < w; i++) - col_sums_y[i] -= im2[j0offs + i]; + for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; j1offs = FS_MINI(j + 4, h - 1) * w; - for (i = 0; i < w; i++) - col_sums_x[i] += im1[j1offs + i]; - for (i = 0; i < w; i++) - col_sums_y[i] += im2[j1offs + i]; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; } } } -#define FS_COL_SET(_col, _joffs, _ioffs) \ - do { \ - unsigned gx; \ - unsigned gy; \ +#define FS_COL_SET(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - col_sums_gx2[(_col)] = gx * (double)gx; \ - col_sums_gy2[(_col)] = gy * (double)gy; \ - col_sums_gxgy[(_col)] = gx * (double)gy; \ - } \ - while (0) + col_sums_gx2[(_col)] = gx * (double)gx; \ + col_sums_gy2[(_col)] = gy * (double)gy; \ + col_sums_gxgy[(_col)] = gx * (double)gy; \ + } while (0) -#define FS_COL_ADD(_col, _joffs, _ioffs) \ - do { \ - unsigned gx; \ - unsigned gy; \ +#define FS_COL_ADD(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - col_sums_gx2[(_col)] += gx * (double)gx; \ - col_sums_gy2[(_col)] += gy * (double)gy; \ - col_sums_gxgy[(_col)] += gx * (double)gy; \ - } \ - while (0) + col_sums_gx2[(_col)] += gx * (double)gx; \ + col_sums_gy2[(_col)] += gy * (double)gy; \ + col_sums_gxgy[(_col)] += gx * (double)gy; \ + } while (0) -#define FS_COL_SUB(_col, _joffs, _ioffs) \ - do { \ - unsigned gx; \ - unsigned gy; \ +#define FS_COL_SUB(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - col_sums_gx2[(_col)] -= gx * (double)gx; \ - col_sums_gy2[(_col)] -= gy * (double)gy; \ - col_sums_gxgy[(_col)] -= gx * (double)gy; \ - } \ - while (0) + col_sums_gx2[(_col)] -= gx * (double)gx; \ + col_sums_gy2[(_col)] -= gy * (double)gy; \ + col_sums_gxgy[(_col)] -= gx * (double)gy; \ + } while (0) -#define FS_COL_COPY(_col1, _col2) \ - do { \ - col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ - col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ +#define FS_COL_COPY(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ - } \ - while (0) + } while (0) -#define FS_COL_HALVE(_col1, _col2) \ - do { \ - col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ - col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ +#define 
FS_COL_HALVE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ - } \ - while (0) + } while (0) -#define FS_COL_DOUBLE(_col1, _col2) \ - do { \ - col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ - col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ +#define FS_COL_DOUBLE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ - } \ - while (0) + } while (0) static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { uint32_t *im1; @@ -340,10 +322,8 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { int j; double ssim_c2 = SSIM_C2; #if CONFIG_VP9_HIGHBITDEPTH - if (bit_depth == 10) - ssim_c2 = SSIM_C2_10; - if (bit_depth == 12) - ssim_c2 = SSIM_C2_12; + if (bit_depth == 10) ssim_c2 = SSIM_C2_10; + if (bit_depth == 12) ssim_c2 = SSIM_C2_12; #else assert(bit_depth == 8); #endif @@ -398,14 +378,11 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { double mugy2; double mugxgy; mugx2 = col_sums_gx2[0]; - for (k = 1; k < 8; k++) - mugx2 += col_sums_gx2[k]; + for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; mugy2 = col_sums_gy2[0]; - for (k = 1; k < 8; k++) - mugy2 += col_sums_gy2[k]; + for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; mugxgy = col_sums_gxgy[0]; - for (k = 1; k < 8; k++) - mugxgy += col_sums_gxgy[k]; + for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k]; ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); if (i + 1 < w) { FS_COL_SET(0, -1, 1); @@ -440,8 +417,9 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}. 
We drop the finest scale and renormalize the rest to sum to 1.*/ -static const double FS_WEIGHTS[FS_NLEVELS] = {0.2989654541015625, - 0.3141326904296875, 0.2473602294921875, 0.1395416259765625}; +static const double FS_WEIGHTS[FS_NLEVELS] = { + 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 +}; static double fs_average(fs_ctx *_ctx, int _l) { double *ssim; @@ -455,28 +433,26 @@ static double fs_average(fs_ctx *_ctx, int _l) { ssim = _ctx->level[_l].ssim; ret = 0; for (j = 0; j < h; j++) - for (i = 0; i < w; i++) - ret += ssim[j * w + i]; + for (i = 0; i < w; i++) ret += ssim[j * w + i]; return pow(ret / (w * h), FS_WEIGHTS[_l]); } static double convert_ssim_db(double _ssim, double _weight) { assert(_weight >= _ssim); - if ((_weight - _ssim) < 1e-10) - return MAX_SSIM_DB; + if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; return 10 * (log10(_weight) - log10(_weight - _ssim)); } -static double calc_ssim(const uint8_t *_src, int _systride, - const uint8_t *_dst, int _dystride, - int _w, int _h, uint32_t _bd, uint32_t _shift) { +static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, + int _dystride, int _w, int _h, uint32_t _bd, + uint32_t _shift) { fs_ctx ctx; double ret; int l; ret = 1; fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); - fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, - _w, _h, _bd, _shift); + fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd, + _shift); for (l = 0; l < FS_NLEVELS - 1; l++) { fs_calc_structure(&ctx, l, _bd); ret *= fs_average(&ctx, l); @@ -490,9 +466,9 @@ static double calc_ssim(const uint8_t *_src, int _systride, } double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *ssim_y, double *ssim_u, double *ssim_v, - uint32_t bd, uint32_t in_bd) { + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd) { double ssimv; uint32_t bd_shift = 0; vpx_clear_system_state(); diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c index 4c0d5db83760beb1b69789caf228e02fcd3e8ec7..4e7d4053ea9ceb20d2e89eff8d88843b9d3f74ee 100644 --- a/vpx_dsp/fwd_txfm.c +++ b/vpx_dsp/fwd_txfm.c @@ -72,8 +72,7 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { { int i, j; for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; } } } @@ -82,8 +81,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; tran_low_t sum = 0; for (r = 0; r < 4; ++r) - for (c = 0; c < 4; ++c) - sum += input[r * stride + c]; + for (c = 0; c < 4; ++c) sum += input[r * stride + c]; output[0] = sum << 1; } @@ -133,8 +131,8 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { x3 = s0 - s3; t0 = (x0 + x1) * cospi_16_64; t1 = (x0 - x1) * cospi_16_64; - t2 = x2 * cospi_24_64 + x3 * cospi_8_64; - t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; output[0] = (tran_low_t)fdct_round_shift(t0); output[2] = (tran_low_t)fdct_round_shift(t2); output[4] = (tran_low_t)fdct_round_shift(t1); @@ -153,24 +151,23 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { x3 = s7 + t3; // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * 
cospi_12_64 + x2 * cospi_20_64; t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; output[1] = (tran_low_t)fdct_round_shift(t0); output[3] = (tran_low_t)fdct_round_shift(t2); output[5] = (tran_low_t)fdct_round_shift(t1); output[7] = (tran_low_t)fdct_round_shift(t3); output += 8; } - in = intermediate; + in = intermediate; output = final_output; } // Rows for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - final_output[j + i * 8] /= 2; + for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; } } @@ -178,8 +175,7 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; tran_low_t sum = 0; for (r = 0; r < 8; ++r) - for (c = 0; c < 8; ++c) - sum += input[r * stride + c]; + for (c = 0; c < 8; ++c) sum += input[r * stride + c]; output[0] = sum; } @@ -214,11 +210,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; - input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4; - input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4; + input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4; + input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4; // Calculate input for the next 8 results. - step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4; - step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4; + step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4; + step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4; step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; @@ -233,11 +229,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); - input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); - input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); + input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2); + input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2); // Calculate input for the next 8 results. 
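// Illustrative note, not part of the patch: the cospi_N_64 constants used in
// these butterflies are (to my reading of txfm_common.h) cos(N*pi/64) in Q14
// fixed point, and fdct_round_shift() is a rounded right shift by
// DCT_CONST_BITS (14), so each t0..t3 pair is a fixed-point rotation. A
// minimal standalone restatement of that rounding (ex_ name is hypothetical):
static long long ex_fdct_round_shift(long long x) {
  return (x + (1LL << 13)) >> 14; /* round(x / 2^14) to nearest */
}
// e.g. with cospi_16_64 == 11585 (~ 2^14 * cos(pi/4)),
// ex_fdct_round_shift(100 * 11585) == 71, i.e. 100 / sqrt(2) rounded.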
- step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); - step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); + step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2); + step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2); step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); @@ -268,7 +264,7 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { x3 = s0 - s3; t0 = (x0 + x1) * cospi_16_64; t1 = (x0 - x1) * cospi_16_64; - t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; t3 = x3 * cospi_24_64 - x2 * cospi_8_64; out[0] = (tran_low_t)fdct_round_shift(t0); out[4] = (tran_low_t)fdct_round_shift(t2); @@ -288,10 +284,10 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { x3 = s7 + t3; // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; out[2] = (tran_low_t)fdct_round_shift(t0); out[6] = (tran_low_t)fdct_round_shift(t2); out[10] = (tran_low_t)fdct_round_shift(t1); @@ -318,12 +314,12 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { step3[6] = step1[6] + step2[5]; step3[7] = step1[7] + step2[4]; // step 4 - temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; - temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; step2[1] = fdct_round_shift(temp1); step2[2] = fdct_round_shift(temp2); temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; - temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; step2[5] = fdct_round_shift(temp1); step2[6] = fdct_round_shift(temp2); // step 5 @@ -336,20 +332,20 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { step1[6] = step3[7] - step2[6]; step1[7] = step3[7] + step2[6]; // step 6 - temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; out[1] = (tran_low_t)fdct_round_shift(temp1); out[9] = (tran_low_t)fdct_round_shift(temp2); temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; - temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; out[5] = (tran_low_t)fdct_round_shift(temp1); out[13] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; out[3] = (tran_low_t)fdct_round_shift(temp1); out[11] = (tran_low_t)fdct_round_shift(temp2); temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; - temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; out[7] = (tran_low_t)fdct_round_shift(temp1); out[15] = (tran_low_t)fdct_round_shift(temp2); } @@ -368,8 +364,7 @@ void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; int sum = 0; for (r = 0; r < 16; ++r) - for (c = 0; c < 16; ++c) - sum += 
input[r * stride + c]; + for (c = 0; c < 16; ++c) sum += input[r * stride + c]; output[0] = (tran_low_t)(sum >> 1); } @@ -675,36 +670,36 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) { step[31] = output[31] + output[30]; // Final stage --- outputs indices are bit-reversed. - output[0] = step[0]; + output[0] = step[0]; output[16] = step[1]; - output[8] = step[2]; + output[8] = step[2]; output[24] = step[3]; - output[4] = step[4]; + output[4] = step[4]; output[20] = step[5]; output[12] = step[6]; output[28] = step[7]; - output[2] = step[8]; + output[2] = step[8]; output[18] = step[9]; output[10] = step[10]; output[26] = step[11]; - output[6] = step[12]; + output[6] = step[12]; output[22] = step[13]; output[14] = step[14]; output[30] = step[15]; - output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); + output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); - output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); + output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); - output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); + output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); - output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); + output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); - output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); + output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); @@ -717,8 +712,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { // Columns for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = input[j * stride + i] * 4; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; @@ -727,8 +721,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) out[j + i * 32] = @@ -746,8 +739,7 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { // Columns for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = input[j * stride + i] * 4; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) // 
TODO(cd): see quality impact of only doing @@ -759,11 +751,9 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; vpx_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - out[j + i * 32] = (tran_low_t)temp_out[j]; + for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; } } @@ -771,8 +761,7 @@ void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; int sum = 0; for (r = 0; r < 32; ++r) - for (c = 0; c < 32; ++c) - sum += input[r * stride + c]; + for (c = 0; c < 32; ++c) sum += input[r * stride + c]; output[0] = (tran_low_t)(sum >> 3); } diff --git a/vpx_dsp/intrapred.c b/vpx_dsp/intrapred.c index b1076f8f01aaf771e970791927c9d713311509a2..4179e0f78efba6514949710bf2361782cc4e7729 100644 --- a/vpx_dsp/intrapred.c +++ b/vpx_dsp/intrapred.c @@ -14,17 +14,16 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -#define DST(x, y) dst[(x) + (y) * stride] +#define DST(x, y) dst[(x) + (y)*stride] #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) #define AVG2(a, b) (((a) + (b) + 1) >> 1) static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - (void) above; + (void)above; // first column - for (r = 0; r < bs - 1; ++r) - dst[r * stride] = AVG2(left[r], left[r + 1]); + for (r = 0; r < bs - 1; ++r) dst[r * stride] = AVG2(left[r], left[r + 1]); dst[(bs - 1) * stride] = left[bs - 1]; dst++; @@ -36,8 +35,7 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst++; // rest of last row - for (c = 0; c < bs - 2; ++c) - dst[(bs - 1) * stride + c] = left[bs - 1]; + for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1]; for (r = bs - 2; r >= 0; --r) for (c = 0; c < bs - 2; ++c) @@ -47,13 +45,13 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - (void) above; + (void)above; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); } dst += stride; } @@ -79,12 +77,12 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - (void) left; + (void)left; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = r & 1 ? 
AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); } dst += stride; } @@ -112,7 +110,7 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - (void) left; + (void)left; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = AVG3(above[r + c], above[r + c + 1], @@ -127,14 +125,12 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, int r, c; // first row - for (c = 0; c < bs; c++) - dst[c] = AVG2(above[c - 1], above[c]); + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); dst += stride; // second row dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst += stride; // the rest of first col @@ -144,8 +140,7 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, // the rest of the block for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-2 * stride + c - 1]; + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; dst += stride; } } @@ -182,8 +177,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) - dst[r * stride] = AVG2(left[r - 1], left[r]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); dst++; dst[0] = AVG3(left[0], above[-1], above[0]); @@ -197,8 +191,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst += stride; for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) - dst[c] = dst[-stride + c - 2]; + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; dst += stride; } } @@ -206,7 +199,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; - (void) left; + (void)left; for (r = 0; r < bs; r++) { memcpy(dst, above, bs); @@ -217,7 +210,7 @@ static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; - (void) above; + (void)above; for (r = 0; r < bs; r++) { memset(dst, left[r], bs); @@ -240,8 +233,8 @@ static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; - (void) above; - (void) left; + (void)above; + (void)left; for (r = 0; r < bs; r++) { memset(dst, 128, bs); @@ -253,10 +246,9 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; - (void) above; + (void)above; - for (i = 0; i < bs; i++) - sum += left[i]; + for (i = 0; i < bs; i++) sum += left[i]; expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { @@ -268,10 +260,9 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int i, r, 
expected_dc, sum = 0; - (void) left; + (void)left; - for (i = 0; i < bs; i++) - sum += above[i]; + for (i = 0; i < bs; i++) sum += above[i]; expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { @@ -338,14 +329,13 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int K = left[2]; const int L = left[3]; (void)above; - DST(0, 0) = AVG2(I, J); + DST(0, 0) = AVG2(I, J); DST(2, 0) = DST(0, 1) = AVG2(J, K); DST(2, 1) = DST(0, 2) = AVG2(K, L); - DST(1, 0) = AVG3(I, J, K); + DST(1, 0) = AVG3(I, J, K); DST(3, 0) = DST(1, 1) = AVG3(J, K, L); DST(3, 1) = DST(1, 2) = AVG3(K, L, L); - DST(3, 2) = DST(2, 2) = - DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; + DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -358,17 +348,17 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int F = above[5]; const int G = above[6]; (void)left; - DST(0, 0) = AVG2(A, B); + DST(0, 0) = AVG2(A, B); DST(1, 0) = DST(0, 2) = AVG2(B, C); DST(2, 0) = DST(1, 2) = AVG2(C, D); DST(3, 0) = DST(2, 2) = AVG2(D, E); - DST(3, 2) = AVG2(E, F); // differs from vp8 + DST(3, 2) = AVG2(E, F); // differs from vp8 - DST(0, 1) = AVG3(A, B, C); + DST(0, 1) = AVG3(A, B, C); DST(1, 1) = DST(0, 3) = AVG3(B, C, D); DST(2, 1) = DST(1, 3) = AVG3(C, D, E); DST(3, 1) = DST(2, 3) = AVG3(D, E, F); - DST(3, 3) = AVG3(E, F, G); // differs from vp8 + DST(3, 3) = AVG3(E, F, G); // differs from vp8 } void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -382,17 +372,17 @@ void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int G = above[6]; const int H = above[7]; (void)left; - DST(0, 0) = AVG2(A, B); + DST(0, 0) = AVG2(A, B); DST(1, 0) = DST(0, 2) = AVG2(B, C); DST(2, 0) = DST(1, 2) = AVG2(C, D); DST(3, 0) = DST(2, 2) = AVG2(D, E); - DST(3, 2) = AVG3(E, F, G); + DST(3, 2) = AVG3(E, F, G); - DST(0, 1) = AVG3(A, B, C); + DST(0, 1) = AVG3(A, B, C); DST(1, 1) = DST(0, 3) = AVG3(B, C, D); DST(2, 1) = DST(1, 3) = AVG3(C, D, E); DST(3, 1) = DST(2, 3) = AVG3(D, E, F); - DST(3, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(F, G, H); } void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -407,13 +397,13 @@ void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int H = above[7]; (void)stride; (void)left; - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); - DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); - DST(3, 2) = DST(2, 3) = AVG3(F, G, H); - DST(3, 3) = H; // differs from vp8 + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = H; // differs from vp8 } void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -428,13 +418,13 @@ void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int H = above[7]; (void)stride; (void)left; - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); - DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); - DST(3, 2) = DST(2, 3) = AVG3(F, G, H); - DST(3, 3) = AVG3(G, H, H); + 
DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(G, H, H); } void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -450,14 +440,14 @@ void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, DST(0, 0) = DST(1, 2) = AVG2(X, A); DST(1, 0) = DST(2, 2) = AVG2(A, B); DST(2, 0) = DST(3, 2) = AVG2(B, C); - DST(3, 0) = AVG2(C, D); + DST(3, 0) = AVG2(C, D); - DST(0, 3) = AVG3(K, J, I); - DST(0, 2) = AVG3(J, I, X); + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); DST(0, 1) = DST(1, 3) = AVG3(I, X, A); DST(1, 1) = DST(2, 3) = AVG3(X, A, B); DST(2, 1) = DST(3, 3) = AVG3(A, B, C); - DST(3, 1) = AVG3(B, C, D); + DST(3, 1) = AVG3(B, C, D); } void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -472,13 +462,13 @@ void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int C = above[2]; const int D = above[3]; (void)stride; - DST(0, 3) = AVG3(J, K, L); - DST(1, 3) = DST(0, 2) = AVG3(I, J, K); - DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); - DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); - DST(3, 1) = DST(2, 0) = AVG3(C, B, A); - DST(3, 0) = AVG3(D, C, B); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); } void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -495,14 +485,14 @@ void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, DST(0, 0) = DST(2, 1) = AVG2(I, X); DST(0, 1) = DST(2, 2) = AVG2(J, I); DST(0, 2) = DST(2, 3) = AVG2(K, J); - DST(0, 3) = AVG2(L, K); + DST(0, 3) = AVG2(L, K); - DST(3, 0) = AVG3(A, B, C); - DST(2, 0) = AVG3(X, A, B); + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); DST(1, 0) = DST(3, 1) = AVG3(I, X, A); DST(1, 1) = DST(3, 2) = AVG3(J, I, X); DST(1, 2) = DST(3, 3) = AVG3(K, J, I); - DST(1, 3) = AVG3(L, K, J); + DST(1, 3) = AVG3(L, K, J); } #if CONFIG_VP9_HIGHBITDEPTH @@ -510,8 +500,8 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) above; - (void) bd; + (void)above; + (void)bd; // First column. for (r = 0; r < bs - 1; ++r) { @@ -529,8 +519,7 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, dst++; // Rest of last row. - for (c = 0; c < bs - 2; ++c) - dst[(bs - 1) * stride + c] = left[bs - 1]; + for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1]; for (r = bs - 2; r >= 0; --r) { for (c = 0; c < bs - 2; ++c) @@ -542,30 +531,30 @@ static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) above; - (void) bd; + (void)above; + (void)bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = c & 1 ? 
AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); } dst += stride; } } -static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) left; - (void) bd; + (void)left; + (void)bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); } dst += stride; } @@ -577,13 +566,13 @@ static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) left; - (void) bd; + (void)left; + (void)bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1], - above[r + c + 2]) - : above[bs * 2 - 1]; + dst[c] = r + c + 2 < bs * 2 + ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2]) + : above[bs * 2 - 1]; } dst += stride; } @@ -593,8 +582,8 @@ static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) left; - (void) bd; + (void)left; + (void)bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = AVG3(above[r + c], above[r + c + 1], @@ -608,17 +597,15 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) bd; + (void)bd; // first row - for (c = 0; c < bs; c++) - dst[c] = AVG2(above[c - 1], above[c]); + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); dst += stride; // second row dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst += stride; // the rest of first col @@ -628,8 +615,7 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, // the rest of the block for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-2 * stride + c - 1]; + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; dst += stride; } } @@ -638,10 +624,9 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) bd; + (void)bd; dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst[stride] = AVG3(above[-1], left[0], left[1]); for (r = 2; r < bs; ++r) @@ -649,8 +634,7 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, dst += stride; for (r = 1; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-stride + c - 1]; + for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1]; dst += stride; } } @@ -659,10 +643,9 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) bd; + (void)bd; dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) - dst[r * 
stride] = AVG2(left[r - 1], left[r]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); dst++; dst[0] = AVG3(left[0], above[-1], above[0]); @@ -676,42 +659,41 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, dst += stride; for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) - dst[c] = dst[-stride + c - 2]; + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; dst += stride; } } -static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int r; - (void) left; - (void) bd; + (void)left; + (void)bd; for (r = 0; r < bs; r++) { memcpy(dst, above, bs * sizeof(uint16_t)); dst += stride; } } -static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int r; - (void) above; - (void) bd; + (void)above; + (void)bd; for (r = 0; r < bs; r++) { vpx_memset16(dst, left[r], bs); dst += stride; } } -static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int r, c; int ytop_left = above[-1]; - (void) bd; + (void)bd; for (r = 0; r < bs; r++) { for (c = 0; c < bs; c++) @@ -724,8 +706,8 @@ static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r; - (void) above; - (void) left; + (void)above; + (void)left; for (r = 0; r < bs; r++) { vpx_memset16(dst, 128 << (bd - 8), bs); @@ -737,11 +719,10 @@ static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; - (void) above; - (void) bd; + (void)above; + (void)bd; - for (i = 0; i < bs; i++) - sum += left[i]; + for (i = 0; i < bs; i++) sum += left[i]; expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { @@ -754,11 +735,10 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; - (void) left; - (void) bd; + (void)left; + (void)bd; - for (i = 0; i < bs; i++) - sum += above[i]; + for (i = 0; i < bs; i++) sum += above[i]; expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { @@ -767,12 +747,12 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; const int count = 2 * bs; - (void) bd; + (void)bd; for (i = 0; i < bs; i++) { sum += above[i]; @@ -791,22 +771,22 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, // This serves as a wrapper function, so that all the prediction functions // can be unified and accessed as a pointer array. Note that the boundary // above and left are not necessarily used all the time. 
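/*
 * Editor's illustration (not part of the patch): the intra_pred_sized()
 * macro below generates one thin per-size wrapper per predictor type by
 * token pasting.  Assuming the size-generic dc_predictor() defined earlier
 * in this file, intra_pred_sized(dc, 4) expands to roughly:
 *
 *   void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
 *                               const uint8_t *above, const uint8_t *left) {
 *     dc_predictor(dst, stride, 4, above, left);
 *   }
 *
 * intra_pred_allsizes() then instantiates the 4, 8, 16 and 32 variants, so
 * every vpx_*_predictor_NxN_c symbol can be exposed through a pointer array.
 */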
-#define intra_pred_sized(type, size) \ - void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \ - ptrdiff_t stride, \ - const uint8_t *above, \ - const uint8_t *left) { \ - type##_predictor(dst, stride, size, above, left); \ +#define intra_pred_sized(type, size) \ + void vpx_##type##_predictor_##size##x##size##_c( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ + const uint8_t *left) { \ + type##_predictor(dst, stride, size, above, left); \ } #if CONFIG_VP9_HIGHBITDEPTH -#define intra_pred_highbd_sized(type, size) \ - void vpx_highbd_##type##_predictor_##size##x##size##_c( \ - uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ - const uint16_t *left, int bd) { \ +#define intra_pred_highbd_sized(type, size) \ + void vpx_highbd_##type##_predictor_##size##x##size##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ highbd_##type##_predictor(dst, stride, size, above, left, bd); \ } +/* clang-format off */ #define intra_pred_allsizes(type) \ intra_pred_sized(type, 4) \ intra_pred_sized(type, 8) \ @@ -855,4 +835,5 @@ intra_pred_allsizes(dc_128) intra_pred_allsizes(dc_left) intra_pred_allsizes(dc_top) intra_pred_allsizes(dc) +/* clang-format on */ #undef intra_pred_allsizes diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c index 707cb92bbb2dcea4ff3a4f4e2d70207907a013f8..d5be32e7db927c5800bda2f77f69a2d96ec1e2cb 100644 --- a/vpx_dsp/inv_txfm.c +++ b/vpx_dsp/inv_txfm.c @@ -15,8 +15,8 @@ #include "vpx_dsp/inv_txfm.h" void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { -/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. */ + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. 
*/ int i; tran_low_t output[16]; tran_high_t a1, b1, c1, d1, e1; @@ -127,8 +127,7 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { // Columns for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; idct4_c(temp_in, temp_out); for (j = 0; j < 4; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -223,8 +222,7 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { // Then transform columns for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; idct8_c(temp_in, temp_out); for (j = 0; j < 8; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -240,8 +238,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) - dest[i] = clip_pixel_add(dest[i], a1); + for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); dest += stride; } } @@ -296,20 +293,20 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { tran_high_t x7 = input[6]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = 0; + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = 0; return; } // stage 1 - s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); - s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); + s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); + s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); - s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); - s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); + s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); + s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); @@ -376,8 +373,7 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { // Then transform columns for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; idct8_c(temp_in, temp_out); for (j = 0; j < 8; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -391,22 +387,22 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) { tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0/2]; - step1[1] = input[16/2]; - step1[2] = input[8/2]; - step1[3] = input[24/2]; - step1[4] = input[4/2]; - step1[5] = input[20/2]; - step1[6] = input[12/2]; - step1[7] = input[28/2]; - step1[8] = input[2/2]; - step1[9] = input[18/2]; - step1[10] = input[10/2]; - step1[11] = input[26/2]; - step1[12] = input[6/2]; - step1[13] = input[22/2]; - step1[14] = input[14/2]; - step1[15] = input[30/2]; + step1[0] = input[0 / 2]; + step1[1] = input[16 / 2]; + step1[2] = input[8 / 2]; + step1[3] = input[24 / 2]; + step1[4] = input[4 / 2]; + step1[5] = input[20 / 2]; + step1[6] = input[12 / 2]; + step1[7] = input[28 / 2]; + step1[8] = input[2 / 2]; + step1[9] = input[18 / 2]; + step1[10] = input[10 / 2]; + step1[11] = input[26 / 2]; + step1[12] = input[6 / 2]; + step1[13] = input[22 / 2]; + 
step1[14] = input[14 / 2]; + step1[15] = input[30 / 2]; // stage 2 step2[0] = step1[0]; @@ -567,8 +563,7 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, // Then transform columns for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; idct16_c(temp_in, temp_out); for (j = 0; j < 16; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -598,21 +593,20 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { tran_high_t x14 = input[1]; tran_high_t x15 = input[14]; - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = output[8] - = output[9] = output[10] = output[11] = output[12] - = output[13] = output[14] = output[15] = 0; + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; return; } // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; s5 = x4 * cospi_23_64 - x5 * cospi_9_64; s6 = x6 * cospi_13_64 + x7 * cospi_19_64; s7 = x6 * cospi_19_64 - x7 * cospi_13_64; @@ -621,9 +615,9 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s10 = x10 * cospi_21_64 + x11 * cospi_11_64; s11 = x10 * cospi_11_64 - x11 * cospi_21_64; s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); @@ -651,14 +645,14 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s5 = x5; s6 = x6; s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; x0 = WRAPLOW(s0 + s4); x1 = WRAPLOW(s1 + s5); @@ -682,18 +676,18 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s1 = x1; s2 = x2; s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; s8 = x8; 
s9 = x9; s10 = x10; s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; x0 = WRAPLOW(s0 + s2); x1 = WRAPLOW(s1 + s3); @@ -713,13 +707,13 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); + s2 = (-cospi_16_64) * (x2 + x3); s3 = cospi_16_64 * (x2 - x3); s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (- x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (- x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); x2 = WRAPLOW(dct_const_round_shift(s2)); @@ -766,8 +760,7 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, // Then transform columns for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; idct16_c(temp_in, temp_out); for (j = 0; j < 16; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -783,8 +776,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) - dest[i] = clip_pixel_add(dest[i], a1); + for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1); dest += stride; } } @@ -1166,8 +1158,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, // Rows for (i = 0; i < 32; ++i) { int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) - zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; for (j = 0; j < 8; ++j) zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; for (j = 0; j < 4; ++j) @@ -1185,8 +1176,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; idct32_c(temp_in, temp_out); for (j = 0; j < 32; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -1197,7 +1187,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[32 * 32] = {0}; + tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; tran_low_t temp_in[32], temp_out[32]; @@ -1212,8 +1202,7 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; idct32_c(temp_in, temp_out); for (j = 0; j < 32; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -1224,7 +1213,7 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[32 * 32] = {0}; + tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; tran_low_t temp_in[32], temp_out[32]; @@ -1239,8 +1228,7 @@ void 
vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; idct32_c(temp_in, temp_out); for (j = 0; j < 32; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -1258,8 +1246,7 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) - dest[i] = clip_pixel_add(dest[i], a1); + for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1); dest += stride; } } @@ -1309,14 +1296,14 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, c1 = e1 - c1; a1 -= b1; d1 += c1; - dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], - HIGHBD_WRAPLOW(a1, bd), bd); - dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], - HIGHBD_WRAPLOW(b1, bd), bd); - dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], - HIGHBD_WRAPLOW(c1, bd), bd); - dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], - HIGHBD_WRAPLOW(d1, bd), bd); + dest[stride * 0] = + highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd); + dest[stride * 1] = + highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd); + dest[stride * 2] = + highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd); + dest[stride * 3] = + highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd); ip++; dest++; @@ -1331,7 +1318,7 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, const tran_low_t *ip = in; tran_low_t *op = tmp; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - (void) bd; + (void)bd; a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; @@ -1343,14 +1330,14 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = highbd_clip_pixel_add( - dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = highbd_clip_pixel_add( - dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = highbd_clip_pixel_add( - dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = highbd_clip_pixel_add( - dest[dest_stride * 3], e1, bd); + dest[dest_stride * 0] = + highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = + highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = + highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = + highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); ip++; dest++; } @@ -1359,7 +1346,7 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step[4]; tran_high_t temp1, temp2; - (void) bd; + (void)bd; // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; @@ -1394,8 +1381,7 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, // Columns for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; vpx_highbd_idct4_c(temp_in, temp_out, bd); for (j = 0; j < 4; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -1408,8 +1394,8 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, int dest_stride, int bd) { int i; tran_high_t a1; - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * 
cospi_16_64), bd); + tran_low_t out = + HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); @@ -1486,8 +1472,7 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, // Then transform columns. for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); for (j = 0; j < 8; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -1500,14 +1485,13 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = + HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); dest += stride; } } @@ -1519,7 +1503,7 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t x1 = input[1]; tran_low_t x2 = input[2]; tran_low_t x3 = input[3]; - (void) bd; + (void)bd; if (!(x0 | x1 | x2 | x3)) { memset(output, 0, 4 * sizeof(*output)); @@ -1561,7 +1545,7 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t x5 = input[4]; tran_low_t x6 = input[1]; tran_low_t x7 = input[6]; - (void) bd; + (void)bd; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { memset(output, 0, 8 * sizeof(*output)); @@ -1569,14 +1553,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; s2 = cospi_10_64 * x2 + cospi_22_64 * x3; s3 = cospi_22_64 * x2 - cospi_10_64 * x3; s4 = cospi_18_64 * x4 + cospi_14_64 * x5; s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); @@ -1592,10 +1576,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); @@ -1644,8 +1628,7 @@ void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, } // Then transform columns. 
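/*
 * Editor's sketch (illustrative only, not part of the patch; the function
 * name is hypothetical): the column pass that follows in
 * vpx_highbd_idct8x8_10_add_c -- like every 2-D inverse transform in this
 * file -- is the second half of a row/column decomposition.  Using the
 * file's own helpers, the 8-bit 8x8 form looks roughly like this; the final
 * shift is 5 for 8x8 and 6 for the 16x16/32x32 variants.
 */
static void idct8x8_add_sketch(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  tran_low_t out[8 * 8], temp_in[8], temp_out[8];
  int i, j;

  // First pass: 1-D inverse transform over each row of coefficients.
  for (i = 0; i < 8; ++i) {
    idct8_c(input, out + i * 8);
    input += 8;
  }

  // Second pass: 1-D inverse transform over each column, then round, clip
  // and add the residual to the prediction already in dest.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
  }
}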
for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); for (j = 0; j < 8; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -1657,25 +1640,25 @@ void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[16], step2[16]; tran_high_t temp1, temp2; - (void) bd; + (void)bd; // stage 1 - step1[0] = input[0/2]; - step1[1] = input[16/2]; - step1[2] = input[8/2]; - step1[3] = input[24/2]; - step1[4] = input[4/2]; - step1[5] = input[20/2]; - step1[6] = input[12/2]; - step1[7] = input[28/2]; - step1[8] = input[2/2]; - step1[9] = input[18/2]; - step1[10] = input[10/2]; - step1[11] = input[26/2]; - step1[12] = input[6/2]; - step1[13] = input[22/2]; - step1[14] = input[14/2]; - step1[15] = input[30/2]; + step1[0] = input[0 / 2]; + step1[1] = input[16 / 2]; + step1[2] = input[8 / 2]; + step1[3] = input[24 / 2]; + step1[4] = input[4 / 2]; + step1[5] = input[20 / 2]; + step1[6] = input[12 / 2]; + step1[7] = input[28 / 2]; + step1[8] = input[2 / 2]; + step1[9] = input[18 / 2]; + step1[10] = input[10 / 2]; + step1[11] = input[26 / 2]; + step1[12] = input[6 / 2]; + step1[13] = input[22 / 2]; + step1[14] = input[14 / 2]; + step1[15] = input[30 / 2]; // stage 2 step2[0] = step1[0]; @@ -1837,8 +1820,7 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, // Then transform columns. for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); for (j = 0; j < 16; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -1867,20 +1849,20 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t x13 = input[12]; tran_low_t x14 = input[1]; tran_low_t x15 = input[14]; - (void) bd; + (void)bd; - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { memset(output, 0, 16 * sizeof(*output)); return; } // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; s5 = x4 * cospi_23_64 - x5 * cospi_9_64; s6 = x6 * cospi_13_64 + x7 * cospi_19_64; s7 = x6 * cospi_19_64 - x7 * cospi_13_64; @@ -1889,9 +1871,9 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s10 = x10 * cospi_21_64 + x11 * cospi_11_64; s11 = x10 * cospi_11_64 - x11 * cospi_21_64; s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); @@ -1901,8 +1883,8 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); x6 = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); + x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); @@ -1981,13 +1963,13 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); + s2 = (-cospi_16_64) * (x2 + x3); s3 = cospi_16_64 * (x2 - x3); s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (-x6 + x7); s10 = cospi_16_64 * (x10 + x11); s11 = cospi_16_64 * (-x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); + s14 = (-cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); @@ -2035,8 +2017,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, // Then transform columns. for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); for (j = 0; j < 16; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -2049,24 +2030,22 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = + HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); dest += stride; } } -void vpx_highbd_idct32_c(const tran_low_t *input, - tran_low_t *output, int bd) { +void vpx_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[32], step2[32]; tran_high_t temp1, temp2; - (void) bd; + (void)bd; // stage 1 step1[0] = input[0]; @@ -2442,8 +2421,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, // Rows for (i = 0; i < 32; ++i) { tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) - zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; for (j = 0; j < 8; ++j) zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; for (j = 0; j < 4; ++j) @@ -2461,8 +2439,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; vpx_highbd_idct32_c(temp_in, temp_out, bd); for (j = 0; j < 32; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -2473,7 +2450,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { - tran_low_t out[32 * 32] = {0}; + 
tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; tran_low_t temp_in[32], temp_out[32]; @@ -2488,8 +2465,7 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, } // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; vpx_highbd_idct32_c(temp_in, temp_out, bd); for (j = 0; j < 32; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -2504,14 +2480,13 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, int a1; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = + HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); dest += stride; } } diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h index 0c4359c270831d0cad66394ed2f1ec57705ff384..0f31a79b5d1d6ad0608fa2f0c0bb2be61072e95e 100644 --- a/vpx_dsp/inv_txfm.h +++ b/vpx_dsp/inv_txfm.h @@ -41,8 +41,7 @@ static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE tran_high_t highbd_check_range(tran_high_t input, - int bd) { +static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { #if CONFIG_COEFFICIENT_RANGE_CHECKING // For valid highbitdepth VP9 streams, intermediate stage coefficients will // stay within the ranges: @@ -53,9 +52,9 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, const int32_t int_min = -int_max - 1; assert(int_min <= input); assert(input <= int_max); - (void) int_min; + (void)int_min; #endif // CONFIG_COEFFICIENT_RANGE_CHECKING - (void) bd; + (void)bd; return input; } @@ -86,15 +85,14 @@ static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { #define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) #if CONFIG_VP9_HIGHBITDEPTH #define HIGHBD_WRAPLOW(x, bd) \ - ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) + ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) #endif // CONFIG_VP9_HIGHBITDEPTH -#else // CONFIG_EMULATE_HARDWARE +#else // CONFIG_EMULATE_HARDWARE #define WRAPLOW(x) ((int32_t)check_range(x)) #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_WRAPLOW(x, bd) \ - ((int32_t)highbd_check_range((x), bd)) +#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd)) #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_EMULATE_HARDWARE diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 645a1ab95ee996085af04c2391d723446f9ce7a4..40f02b46d93aaf4e3bd395d73225ea82c2097634 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -22,23 +22,18 @@ static INLINE int8_t signed_char_clamp(int t) { #if CONFIG_VP9_HIGHBITDEPTH static INLINE int16_t signed_char_clamp_high(int t, int bd) { switch (bd) { - case 10: - return (int16_t)clamp(t, -128*4, 128*4-1); - case 12: - return (int16_t)clamp(t, -128*16, 128*16-1); + case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); + case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); case 8: - default: - return (int16_t)clamp(t, -128, 128-1); + default: return (int16_t)clamp(t, -128, 128 - 1); } } #endif // should we apply any filter at all: 
11111111 yes, 00000000 no -static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, - uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, - uint8_t q2, uint8_t q3) { +static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3) { int8_t mask = 0; mask |= (abs(p3 - p2) > limit) * -1; mask |= (abs(p2 - p1) > limit) * -1; @@ -46,14 +41,12 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, mask |= (abs(q1 - q0) > limit) * -1; mask |= (abs(q2 - q1) > limit) * -1; mask |= (abs(q3 - q2) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; return ~mask; } -static INLINE int8_t flat_mask4(uint8_t thresh, - uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, +static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { int8_t mask = 0; mask |= (abs(p1 - p0) > thresh) * -1; @@ -65,12 +58,10 @@ static INLINE int8_t flat_mask4(uint8_t thresh, return ~mask; } -static INLINE int8_t flat_mask5(uint8_t thresh, - uint8_t p4, uint8_t p3, - uint8_t p2, uint8_t p1, - uint8_t p0, uint8_t q0, - uint8_t q1, uint8_t q2, - uint8_t q3, uint8_t q4) { +static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3, + uint8_t q4) { int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); mask |= (abs(p4 - p0) > thresh) * -1; mask |= (abs(q4 - q0) > thresh) * -1; @@ -81,8 +72,8 @@ static INLINE int8_t flat_mask5(uint8_t thresh, static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { int8_t hev = 0; - hev |= (abs(p1 - p0) > thresh) * -1; - hev |= (abs(q1 - q0) > thresh) * -1; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; return hev; } @@ -90,10 +81,10 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { int8_t filter1, filter2; - const int8_t ps1 = (int8_t) *op1 ^ 0x80; - const int8_t ps0 = (int8_t) *op0 ^ 0x80; - const int8_t qs0 = (int8_t) *oq0 ^ 0x80; - const int8_t qs1 = (int8_t) *oq1 ^ 0x80; + const int8_t ps1 = (int8_t)*op1 ^ 0x80; + const int8_t ps0 = (int8_t)*op0 ^ 0x80; + const int8_t qs0 = (int8_t)*oq0 ^ 0x80; + const int8_t qs1 = (int8_t)*oq1 ^ 0x80; const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); // add outer taps if we have high edge variance @@ -127,9 +118,9 @@ void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } @@ -151,9 +142,9 @@ void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, // of 8 bit simd instructions. 
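/*
 * Editor's note (illustrative sketch, not part of the patch): filter_mask(),
 * flat_mask4/5() and hev_mask() above all return branchless byte masks.
 * Each "(condition) * -1" term ORs 0xff into mask when a threshold is
 * exceeded, and the final ~mask is 0xff ("11111111 yes") only if every test
 * passed.  The idiom in isolation (helper name is hypothetical):
 */
static INLINE int8_t byte_mask_sketch(int edge_step_too_big,
                                      int edge_diff_too_big) {
  int8_t mask = 0;
  mask |= (edge_step_too_big != 0) * -1;  // becomes 0xff if this test fails
  mask |= (edge_diff_too_big != 0) * -1;  // becomes 0xff if this test fails
  return ~mask;  // 0xff = apply the filter, 0x00 = leave the pixels alone
}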
for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); filter4(mask, *thresh, s - 2, s - 1, s, s + 1); s += pitch; } @@ -168,9 +159,8 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, } static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3) { if (flat && mask) { const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; @@ -184,7 +174,7 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { - filter4(mask, thresh, op1, op0, oq0, oq1); + filter4(mask, thresh, op1, op0, oq0, oq1); } } @@ -198,11 +188,11 @@ void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p); ++s; } } @@ -222,11 +212,11 @@ void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, + s + 3); s += pitch; } } @@ -239,52 +229,55 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter16(int8_t mask, uint8_t thresh, - uint8_t flat, uint8_t flat2, - uint8_t *op7, uint8_t *op6, - uint8_t *op5, uint8_t *op4, - uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, - uint8_t *oq2, uint8_t *oq3, - uint8_t *oq4, uint8_t *oq5, +static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint8_t *op7, uint8_t *op6, + uint8_t *op5, uint8_t *op4, uint8_t *op3, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, + uint8_t *oq3, uint8_t *oq4, uint8_t *oq5, uint8_t *oq6, uint8_t *oq7) { if (flat2 && flat && mask) { - const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, - p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t 
p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, + p2 = *op2, p1 = *op1, p0 = *op0; - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, - q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6, q7 = *oq7; // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] - *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + - q0, 4); - *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + - q0 + q1, 4); - *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + - q0 + q1 + q2, 4); - *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + - q0 + q1 + q2 + q3, 4); - *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4, 4); + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + - q0 + q1 + q2 + q3 + q4 + q5, 4); - *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + - q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); - *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + - q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); - *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + - q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); - *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + - q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); - *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + - q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); - *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + - q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); - *oq6 = ROUND_POWER_OF_TWO(p0 + - q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); } else { filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); } @@ -300,18 +293,17 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, for (i = 0; i < 8 * count; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, 
q0, q1, q2, q3); + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flat_mask5(1, - s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, - q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - - filter16(mask, *thresh, flat, flat2, - s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); + const int8_t flat2 = + flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, + s[4 * p], s[5 * p], s[6 * p], s[7 * p]); + + filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, + s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, + s + 7 * p); ++s; } } @@ -326,25 +318,23 @@ void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); } -static void mb_lpf_vertical_edge_w(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int count) { int i; for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, - q0, s[4], s[5], s[6], s[7]); + const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], + s[5], s[6], s[7]); - filter16(mask, *thresh, flat, flat2, - s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); + filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, + s + 7); s += p; } } @@ -362,9 +352,8 @@ void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, #if CONFIG_VP9_HIGHBITDEPTH // Should we apply any filter at all: 11111111 yes, 00000000 no ? 
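/*
 * Editor's note (illustrative sketch, not part of the patch): the
 * high-bit-depth masks below reuse the 8-bit blimit/limit/thresh values by
 * shifting them up to the working bit depth, e.g.
 * "limit16 = (uint16_t)limit << (bd - 8)", which matches the clamp ranges in
 * signed_char_clamp_high() (+/-128 at bd=8, +/-128*4 at bd=10, +/-128*16 at
 * bd=12).  In isolation (helper name is hypothetical):
 */
static INLINE int16_t highbd_scale_thresh_sketch(uint8_t thresh8, int bd) {
  // bd is 8, 10 or 12, so the shifted value still fits in an int16_t.
  return (int16_t)((uint16_t)thresh8 << (bd - 8));
}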
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, - uint16_t p3, uint16_t p2, - uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, + uint16_t p3, uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, int bd) { int8_t mask = 0; int16_t limit16 = (uint16_t)limit << (bd - 8); @@ -375,15 +364,14 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, mask |= (abs(q1 - q0) > limit16) * -1; mask |= (abs(q2 - q1) > limit16) * -1; mask |= (abs(q3 - q2) > limit16) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; return ~mask; } -static INLINE int8_t highbd_flat_mask4(uint8_t thresh, - uint16_t p3, uint16_t p2, - uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, - uint16_t q2, uint16_t q3, int bd) { +static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, uint16_t q3, + int bd) { int8_t mask = 0; int16_t thresh16 = (uint16_t)thresh << (bd - 8); mask |= (abs(p1 - p0) > thresh16) * -1; @@ -395,11 +383,9 @@ static INLINE int8_t highbd_flat_mask4(uint8_t thresh, return ~mask; } -static INLINE int8_t highbd_flat_mask5(uint8_t thresh, - uint16_t p4, uint16_t p3, - uint16_t p2, uint16_t p1, - uint16_t p0, uint16_t q0, - uint16_t q1, uint16_t q2, +static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3, + uint16_t p2, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, uint16_t q4, int bd) { int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); int16_t thresh16 = (uint16_t)thresh << (bd - 8); @@ -470,21 +456,17 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, const uint16_t q1 = s[1 * p]; const uint16_t q2 = s[2 * p]; const uint16_t q3 = s[3 * p]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); ++s; } } -void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_horizontal_4_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); } @@ -498,31 +480,26 @@ void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, // of 8 bit simd instructions. 
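/*
 * Editor's note (illustrative sketch, not part of the patch): in all of the
 * loop-filter kernels p3..p0 are the samples on one side of the edge and
 * q0..q3 on the other, read with unit stride for a vertical edge
 * (s[-4]..s[-1], s[0]..s[3]) and with the row pitch for a horizontal edge
 * (s[-4 * p]..s[-p], s[0 * p]..s[3 * p]).  A generic gather of those eight
 * samples (helper name is hypothetical):
 */
static INLINE void gather_edge_samples_sketch(const uint16_t *s, int step,
                                              uint16_t p[4], uint16_t q[4]) {
  int i;
  // step == 1 for a vertical edge, step == pitch for a horizontal edge.
  for (i = 0; i < 4; ++i) {
    p[i] = s[-(i + 1) * step];  // p0 is closest to the edge, p3 farthest
    q[i] = s[i * step];         // q0 is closest to the edge, q3 farthest
  }
}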
for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); s += pitch; } } -void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_vertical_4_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, bd); + vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + bd); } static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, - uint16_t *op3, uint16_t *op2, - uint16_t *op1, uint16_t *op0, - uint16_t *oq0, uint16_t *oq1, + uint16_t *op3, uint16_t *op2, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { if (flat && mask) { const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; @@ -536,7 +513,7 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { - highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); } } @@ -551,25 +528,20 @@ void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - highbd_filter8(mask, *thresh, flat, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, + s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); ++s; } } -void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_horizontal_8_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); } @@ -582,40 +554,31 @@ void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 
= s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - highbd_filter8(mask, *thresh, flat, - s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, - bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, + s + 2, s + 3, bd); s += pitch; } } -void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_vertical_8_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, bd); -} - -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, - uint8_t flat, uint8_t flat2, - uint16_t *op7, uint16_t *op6, - uint16_t *op5, uint16_t *op4, - uint16_t *op3, uint16_t *op2, - uint16_t *op1, uint16_t *op0, - uint16_t *oq0, uint16_t *oq1, - uint16_t *oq2, uint16_t *oq3, - uint16_t *oq4, uint16_t *oq5, + vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint16_t *op7, uint16_t *op6, + uint16_t *op5, uint16_t *op4, uint16_t *op3, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + uint16_t *oq3, uint16_t *oq4, uint16_t *oq5, uint16_t *oq6, uint16_t *oq7, int bd) { if (flat2 && flat && mask) { const uint16_t p7 = *op7; @@ -636,34 +599,40 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, const uint16_t q7 = *oq7; // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] - *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + - q0, 4); - *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + - q0 + q1, 4); - *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + - q0 + q1 + q2, 4); - *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + - q0 + q1 + q2 + q3, 4); - *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4, 4); + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + - q0 + q1 + q2 + q3 + q4 + q5, 4); - *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + - q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); - *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + - q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); - *oq1 = 
ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + - q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); - *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + - q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); - *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + - q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); - *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + - q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); - *oq6 = ROUND_POWER_OF_TWO(p0 + - q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); } else { highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, bd); @@ -673,8 +642,8 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count, int bd) { + const uint8_t *thresh, int count, + int bd) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -688,20 +657,18 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint16_t q1 = s[1 * p]; const uint16_t q2 = s[2 * p]; const uint16_t q3 = s[3 * p]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - const int8_t flat2 = highbd_flat_mask5( - 1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, - q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); - - highbd_filter16(mask, *thresh, flat, flat2, - s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p, - bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = + highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, + s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, + s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, + s + 6 * p, s + 7 * p, bd); ++s; } } @@ -723,8 +690,8 @@ void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count, int bd) { + const uint8_t *thresh, int count, + int bd) { int i; for (i = 0; i < count; ++i) { @@ -736,17 +703,16 @@ static void 
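The 15-tap sums above (centre sample weighted 2, its fourteen neighbours weighted 1, total weight 16) are normalized by ROUND_POWER_OF_TWO(sum, 4), i.e. a rounded division by 16. A minimal scalar sketch, assuming the macro matches the usual libvpx definition in vpx_dsp/vpx_dsp_common.h; the sample values are made up:

#include <stdint.h>
#include <stdio.h>

/* Assumed to match the libvpx macro from vpx_dsp/vpx_dsp_common.h. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  /* Illustrative 12-bit samples for p7..p0 and q0..q6 around one edge. */
  const uint16_t p[8] = { 820, 818, 816, 814, 812, 810, 808, 806 }; /* p7..p0 */
  const uint16_t q[7] = { 804, 802, 800, 798, 796, 794, 792 };      /* q0..q6 */
  uint32_t sum = p[7]; /* p0 is the centre tap and carries weight 2 */
  int i;
  for (i = 0; i < 8; ++i) sum += p[i];
  for (i = 0; i < 7; ++i) sum += q[i];
  /* Weights total 16, so the shift by 4 is a rounded average -> new *op0. */
  printf("op0 = %u\n", ROUND_POWER_OF_TWO(sum, 4));
  return 0;
}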
highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, const uint16_t q1 = s[1]; const uint16_t q2 = s[2]; const uint16_t q3 = s[3]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7], bd); - highbd_filter16(mask, *thresh, flat, flat2, - s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, - bd); + highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, + s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, + s + 5, s + 6, s + 7, bd); s += p; } } @@ -760,8 +726,7 @@ void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int bd) { + const uint8_t *thresh, int bd) { highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/mips/common_dspr2.h b/vpx_dsp/mips/common_dspr2.h index 7a10bf1c4050719bec76e0fef47d88d47ce86f01..0a42f5cec21f88c1564a228d45e9dd2ee2729d07 100644 --- a/vpx_dsp/mips/common_dspr2.h +++ b/vpx_dsp/mips/common_dspr2.h @@ -24,37 +24,21 @@ extern "C" { extern uint8_t *vpx_ff_cropTbl; // From "vpx_dsp/mips/intrapred4_dspr2.c" static INLINE void prefetch_load(const unsigned char *src) { - __asm__ __volatile__ ( - "pref 0, 0(%[src]) \n\t" - : - : [src] "r" (src) - ); + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); } /* prefetch data for store */ static INLINE void prefetch_store(unsigned char *dst) { - __asm__ __volatile__ ( - "pref 1, 0(%[dst]) \n\t" - : - : [dst] "r" (dst) - ); + __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); } static INLINE void prefetch_load_streamed(const unsigned char *src) { - __asm__ __volatile__ ( - "pref 4, 0(%[src]) \n\t" - : - : [src] "r" (src) - ); + __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src)); } /* prefetch data for store */ static INLINE void prefetch_store_streamed(unsigned char *dst) { - __asm__ __volatile__ ( - "pref 5, 0(%[dst]) \n\t" - : - : [dst] "r" (dst) - ); + __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst)); } #endif // #if HAVE_DSPR2 #ifdef __cplusplus diff --git a/vpx_dsp/mips/convolve2_avg_dspr2.c b/vpx_dsp/mips/convolve2_avg_dspr2.c index 3c767672fbd23afdae79668dc6f08c1c20ed34a9..ae88eddfd61dcf38c7be78e3c7fc85120b94bd45 100644 --- a/vpx_dsp/mips/convolve2_avg_dspr2.c +++ b/vpx_dsp/mips/convolve2_avg_dspr2.c @@ -18,25 +18,22 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t w, +static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, int32_t h) { - int32_t x, y; + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + 
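The prefetch helpers and convolution kernels in these MIPS files only change whitespace here; the operand lists are untouched. For readers of the constraint strings: "=&r" is a write-only register output with early-clobber, "+r" a read-write register operand, and "r" a read-only register input. A commented restatement of prefetch_load, assuming a MIPS32 toolchain with the DSP ASE (this is not portable C):

/* Same statement as prefetch_load above, with the extended-asm pieces
 * spelled out.  "pref 0" is the prefetch-for-load hint; hints 1, 4 and 5
 * are the store / streamed variants used by the other helpers. */
static inline void prefetch_load_annotated(const unsigned char *src) {
  __asm__ __volatile__(
      "pref 0, 0(%[src]) \n\t" /* one instruction, no result register */
      :                        /* outputs: none */
      : [src] "r"(src)         /* inputs: src, read-only, in a register */
  );                           /* volatile: the compiler must not drop it */
}

/* In the convolution kernels the same syntax carries more operands:
 *   [load1] "=&r"(load1)     scratch result, early-clobber so it cannot
 *                            share a register with any input
 *   [src_ptr] "+r"(src_ptr)  read-write: the asm advances the pointer
 *   [filter45] "r"(filter45) plain read-only input                      */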
uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; const int16_t *filter = &filter_y[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -48,7 +45,7 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -105,16 +102,13 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), [cm] "r" (cm), - [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -124,23 +118,21 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, } static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, - int32_t h) { - int32_t x, y; + const int16_t *filter_y, int32_t h) { + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; const int16_t *filter = &filter_y[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -153,7 +145,7 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -210,16 +202,13 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), [cm] "r" (cm), - [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), 
[src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -231,18 +220,16 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { uint32_t pos = 38; assert(y_step_q4 == 16); /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); prefetch_store(dst); @@ -251,22 +238,17 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, case 8: case 16: case 32: - convolve_bi_avg_vert_4_dspr2(src, src_stride, - dst, dst_stride, - filter_y, w, h); + convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, + w, h); break; case 64: prefetch_store(dst + 32); - convolve_bi_avg_vert_64_dspr2(src, src_stride, - dst, dst_stride, - filter_y, h); + convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c index 932a73d39b17beda50b0ac1e59fafb62d5cf0a2b..e944207b6ee79761320e76e7360d7788a0696095 100644 --- a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -19,20 +19,18 @@ #if HAVE_DSPR2 static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; - int32_t Temp1, Temp2, Temp3, Temp4; + int32_t Temp1, Temp2, Temp3, Temp4; uint32_t vector4a = 64; uint32_t tp1, tp2; uint32_t p1, p2, p3; uint32_t tn1, tn2; const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -42,7 +40,7 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -61,51 +59,49 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" "extp %[Temp3], $ac2, 31 \n\t" - "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ "mtlo %[vector4a], $ac3 \n\t" "mthi $zero, $ac3 \n\t" - "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ "preceu.ph.qbr %[p1], %[tp2] \n\t" "preceu.ph.qbl %[p3], %[tp2] \n\t" "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ /* odd 2. 
pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ "mtlo %[vector4a], $ac2 \n\t" "mthi $zero, $ac2 \n\t" - "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ - "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" "extp %[Temp4], $ac2, 31 \n\t" - "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ - "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ /* clamp */ - "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ - "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ - "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ - "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ - "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ - "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ - "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -114,11 +110,9 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, } static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector4a = 64; @@ -127,7 +121,7 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, uint32_t p1, p2, p3, p4, n1; uint32_t st0, st1; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -137,7 +131,7 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -246,15 +240,12 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, "sb %[tp4], 5(%[dst]) \n\t" "sb %[tp1], 7(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [st0] "=&r" (st0), [st1] "=&r" (st1), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); /* Next row... 
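Every one of these 2-tap kernels derives filter45 the same way: filter = &filter_x0[3], and a 32-bit load packs taps 3 and 4 into one word so that dpa.w.ph can multiply-accumulate both in a single instruction. A portable sketch of the packing, using memcpy in place of the type-punned load; the halfword order inside the word follows host endianness, just as it does for the original cast:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  /* Illustrative bilinear kernel; only slots 3 and 4 are non-zero. */
  const int16_t filter_x0[8] = { 0, 0, 0, 96, 32, 0, 0, 0 };
  uint32_t filter45;

  /* Equivalent of filter45 = ((const int32_t *)&filter_x0[3])[0], without
   * the aliasing cast. */
  memcpy(&filter45, &filter_x0[3], sizeof(filter45));
  printf("packed taps 3 and 4: 0x%08" PRIx32 "\n", filter45);
  return 0;
}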
*/ src += src_stride; @@ -263,12 +254,10 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, } static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, - int32_t count) { + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -279,7 +268,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -293,7 +282,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -493,14 +482,13 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [qload3] "=&r" (qload3), [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); src += 16; dst += 16; @@ -513,11 +501,10 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, } static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -528,7 +515,7 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -544,7 +531,7 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride + 32); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -744,14 +731,13 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [qload3] "=&r" (qload3), [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), 
[dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); src += 16; dst += 16; @@ -773,11 +759,9 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -786,39 +770,31 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_bi_avg_horiz_4_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; case 8: - convolve_bi_avg_horiz_8_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; case 16: - convolve_bi_avg_horiz_16_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h, 1); + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); break; case 32: - convolve_bi_avg_horiz_16_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h, 2); + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); break; case 64: prefetch_load(src + 64); prefetch_store(dst + 32); - convolve_bi_avg_horiz_64_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve2_dspr2.c b/vpx_dsp/mips/convolve2_dspr2.c index d111029d42a59858dccc7329d3298f2876826f57..e355ba3a06cb24a498bbed89a63e7ee75eb780d1 100644 --- a/vpx_dsp/mips/convolve2_dspr2.c +++ b/vpx_dsp/mips/convolve2_dspr2.c @@ -18,21 +18,18 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t y; - uint8_t *cm = vpx_ff_cropTbl; - uint8_t *dst_ptr; - int32_t Temp1, Temp2; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2; +static void convolve_bi_horiz_4_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint8_t *dst_ptr; + int32_t Temp1, Temp2; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -42,7 +39,7 @@ static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -94,13 +91,10 @@ static void 
convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, "sb %[p2], 0(%[dst_ptr]) \n\t" "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [dst_ptr] "+r" (dst_ptr) - : [filter45] "r" (filter45),[vector4a] "r" (vector4a), - [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_stride] "r"(dst_stride)); /* Next row... */ src += src_stride; @@ -108,12 +102,9 @@ static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, } } -static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_bi_horiz_8_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint8_t *dst_ptr; @@ -124,7 +115,7 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, uint8_t *odd_dst; uint32_t dst_pitch_2 = (dst_stride << 1); const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -136,7 +127,7 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, dst_ptr = dst; odd_dst = (dst_ptr + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -180,7 +171,8 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" "extp %[Temp3], $ac2, 31 \n\t" - "lbux %[Temp1], %[p3](%[cm]) \n\t" + "lbux %[Temp1], %[p3](%[cm]) " + "\n\t" /* odd 1. pixel */ "mtlo %[vector4a], $ac1 \n\t" @@ -231,13 +223,12 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, "sb %[p1], 0(%[odd_dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) - : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), + [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); /* Next row... 
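The _transposed kernels above write each horizontally filtered source row down a column of dst: within a row the store pointer advances by dst_stride per output pixel (the 8/16/64 variants interleave even and odd outputs through dst and odd_dst in steps of 2 * dst_stride), so the output block is the transpose of what a normal horizontal pass would produce. A plain-C model of that addressing, with the bilinear filter reduced to a rounded average purely for illustration:

#include <stdint.h>

/* out[x * dst_stride + y] = filter(in row y at column x): the horizontal
 * pass lands in a transposed intermediate buffer.  The real taps live in
 * filter45; a rounded average of two neighbouring pixels stands in here. */
static void bi_horiz_transposed_model(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride, int w,
                                      int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const uint8_t filtered = (uint8_t)((src[x] + src[x + 1] + 1) >> 1);
      dst[x * dst_stride + y] = filtered;
    }
    src += src_stride;
  }
}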
*/ src += src_stride; @@ -245,26 +236,22 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, } } -static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, - int32_t count) { - int32_t c, y; +static void convolve_bi_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; const uint8_t *src; - uint8_t *dst; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -279,193 +266,329 @@ static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr, odd_dst = (dst + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload1], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p5], %[qload1] \n\t" - "ulw %[qload2], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ /* even 3. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ /* even 7. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ /* odd 7. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); src += 16; dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); @@ -478,25 +601,22 @@ static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr, } } -static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t c, y; +static void convolve_bi_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; const uint8_t *src; - uint8_t *dst; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t 
qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -512,193 +632,329 @@ static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr, odd_dst = (dst + dst_stride); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload1], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p5], %[qload1] \n\t" - "ulw %[qload2], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ /* even 4. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ /* even 8. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ /* odd 4. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); src += 16; dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); @@ -731,18 +987,15 @@ void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter, - int w, int h) { +void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h) { uint32_t pos = 38; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -750,32 +1003,26 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_bi_horiz_4_transposed_dspr2(src, src_stride, - dst, dst_stride, + convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride, filter, h); break; case 8: - convolve_bi_horiz_8_transposed_dspr2(src, src_stride, - dst, dst_stride, + convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride, filter, h); break; case 16: case 32: - convolve_bi_horiz_16_transposed_dspr2(src, src_stride, - dst, dst_stride, - filter, h, - (w/16)); + convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h, (w / 16)); break; case 64: prefetch_load(src + 32); - convolve_bi_horiz_64_transposed_dspr2(src, src_stride, - dst, dst_stride, + convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride, filter, h); break; default: - 
convolve_bi_horiz_transposed(src, src_stride, - dst, dst_stride, - filter, w, h); + convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w, + h); break; } } diff --git a/vpx_dsp/mips/convolve2_horiz_dspr2.c b/vpx_dsp/mips/convolve2_horiz_dspr2.c index 9fe1a3454b7e43fbdb0bf6175767eecd9b60e0c7..5cc06b5f26007da6b0d5ade8e11c1b4c8e0083fd 100644 --- a/vpx_dsp/mips/convolve2_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve2_horiz_dspr2.c @@ -18,12 +18,9 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_bi_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; int32_t Temp1, Temp2, Temp3, Temp4; @@ -31,7 +28,7 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src, uint32_t tp1, tp2; uint32_t p1, p2; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -41,7 +38,7 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -86,13 +83,11 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src, "sb %[tp2], 2(%[dst]) \n\t" "sb %[p2], 3(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -100,12 +95,9 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src, } } -static void convolve_bi_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector4a = 64; @@ -114,7 +106,7 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src, uint32_t p1, p2, p3, p4; uint32_t st0, st1; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -124,7 +116,7 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -210,13 +202,12 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src, "sb %[p2], 5(%[dst]) \n\t" "sb %[p1], 7(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), - [st0] "=&r" (st0), [st1] "=&r" (st1), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); /* Next row... 
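For reference, the bilinear (convolve2) horizontal kernels above pack the two centre taps of the 8-tap filter into one 32-bit word (filter45, read from &filter_x0[3]) and apply it two pixels at a time with dpa.w.ph, extracting and clamping through the vpx_ff_cropTbl lookup. A minimal scalar sketch of what a single output pixel works out to, assuming the usual 7-bit filter precision (round by 64, shift by 7); the helper name is invented for illustration and is not part of the tree:

#include <stdint.h>

/* Illustrative only: scalar equivalent of one bilinear output pixel. */
static uint8_t bilinear_pixel_sketch(const uint8_t *src,
                                     const int16_t *filter_x0) {
  const int16_t *filter = &filter_x0[3];  /* centre taps of the 8-tap set */
  int sum = src[0] * filter[0] + src[1] * filter[1];
  sum = (sum + 64) >> 7;                  /* round, assuming FILTER_BITS == 7 */
  if (sum < 0) sum = 0;                   /* clamp, same effect as the cm[] table */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}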
*/ src += src_stride; @@ -225,11 +216,9 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src, } static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, + int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, + const int16_t *filter_x0, int32_t h, int32_t count) { int32_t y, c; const uint8_t *src; @@ -241,7 +230,7 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -255,7 +244,7 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -413,14 +402,13 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); src += 16; dst += 16; @@ -433,11 +421,9 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, } static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, + int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + const int16_t *filter_x0, int32_t h) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -448,7 +434,7 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -464,7 +450,7 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride + 32); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -622,14 +608,13 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] 
"=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); src += 16; dst += 16; @@ -644,8 +629,8 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { uint32_t pos = 38; assert(x_step_q4 == 16); @@ -653,11 +638,9 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, prefetch_load((const uint8_t *)filter_x); /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -666,39 +649,31 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; case 8: - convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; case 16: - convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h, 1); + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); break; case 32: - convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h, 2); + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); break; case 64: prefetch_load(src + 64); prefetch_store(dst + 32); - convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve2_vert_dspr2.c b/vpx_dsp/mips/convolve2_vert_dspr2.c index dde6ffd54f8182bad34f3128079da4d94f87a0b4..eb1975e4475ac4883ae0b73391595361b35000d7 100644 --- a/vpx_dsp/mips/convolve2_vert_dspr2.c +++ b/vpx_dsp/mips/convolve2_vert_dspr2.c @@ -18,25 +18,22 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_bi_vert_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t w, +static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, int32_t h) { - int32_t x, y; + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1; - uint32_t store1, store2; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + 
uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; const int16_t *filter = &filter_y[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -48,7 +45,7 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -98,16 +95,12 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [scratch1] "=&r" (scratch1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [filter45] "r" (filter45),[vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), - [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -116,24 +109,21 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src, } } -static void convolve_bi_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t h) { - int32_t x, y; +static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1; - uint32_t store1, store2; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; const int16_t *filter = &filter_y[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -145,7 +135,7 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -195,16 +185,12 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [scratch1] "=&r" (scratch1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [filter45] "r" (filter45),[vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), - [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] 
"r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -216,42 +202,34 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { uint32_t pos = 38; assert(y_step_q4 == 16); /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); prefetch_store(dst); switch (w) { - case 4 : - case 8 : - case 16 : - case 32 : - convolve_bi_vert_4_dspr2(src, src_stride, - dst, dst_stride, - filter_y, w, h); + case 4: + case 8: + case 16: + case 32: + convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); break; - case 64 : + case 64: prefetch_store(dst + 32); - convolve_bi_vert_64_dspr2(src, src_stride, - dst, dst_stride, - filter_y, h); + convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve8_avg_dspr2.c b/vpx_dsp/mips/convolve8_avg_dspr2.c index 43da9e54fb2f7b88578d38a476c3660e6de7d884..31812299c34e1d17d909ed5b550f6aacf64ef97b 100644 --- a/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -18,25 +18,22 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_avg_vert_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t w, +static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, int32_t h) { - int32_t x, y; + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; vector1b = ((const int32_t *)filter_y)[0]; vector2b = ((const int32_t *)filter_y)[1]; @@ -53,7 +50,7 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -160,18 +157,16 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [vector1b] "r" 
(vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -180,24 +175,21 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src, } } -static void convolve_avg_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t h) { - int32_t x, y; +static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; vector1b = ((const int32_t *)filter_y)[0]; vector2b = ((const int32_t *)filter_y)[1]; @@ -215,7 +207,7 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -322,18 +314,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... 
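The avg vertical kernels above run the same 8-tap column filter as the plain vertical path and then fold in the pixel already present in dst with a rounding average (the addqh_r.w step). A hedged scalar sketch of one output pixel, again assuming 7-bit filter precision; the function name is invented for illustration:

#include <stdint.h>

/* Illustrative only: 8-tap vertical filter for one pixel, averaged with
 * the value already stored at the destination. */
static uint8_t avg_vert_pixel_sketch(const uint8_t *src, int src_stride,
                                     uint8_t old_dst,
                                     const int16_t *filter_y) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k)
    sum += src[k * src_stride] * filter_y[k];  /* column taps */
  sum = (sum + 64) >> 7;                       /* round, FILTER_BITS == 7 assumed */
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)((old_dst + sum + 1) >> 1);  /* rounding average with dst */
}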
*/ @@ -345,26 +335,21 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_avg_vert_dspr2(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } else { uint32_t pos = 38; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); prefetch_store(dst); @@ -373,22 +358,17 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, case 8: case 16: case 32: - convolve_avg_vert_4_dspr2(src, src_stride, - dst, dst_stride, - filter_y, w, h); + convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); break; case 64: prefetch_store(dst + 32); - convolve_avg_vert_64_dspr2(src, src_stride, - dst, dst_stride, - filter_y, h); + convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } @@ -397,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { /* Fixed size intermediate buffer places limits on parameters. 
*/ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; @@ -408,27 +388,20 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); assert(y_step_q4 == 16); - if (intermediate_height < h) - intermediate_height = h; + if (intermediate_height < h) intermediate_height = h; - vpx_convolve8_horiz(src - (src_stride * 3), src_stride, - temp, 64, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, intermediate_height); + vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, intermediate_height); - vpx_convolve8_avg_vert(temp + 64 * 3, 64, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { + const int16_t *filter_y, int filter_y_stride, int w, + int h) { int x, y; uint32_t tp1, tp2, tn1; uint32_t tp3, tp4, tn2; @@ -441,21 +414,19 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: /* 1 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ - : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1), - [tp2] "=&r" (tp2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; @@ -463,26 +434,24 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, break; case 8: /* 2 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; @@ -490,34 +459,32 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, break; case 16: /* 4 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ 
__volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 8(%[src]) \n\t" "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ "ulw %[tp3], 12(%[src]) \n\t" "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; @@ -525,50 +492,48 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, break; case 32: /* 8 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 8(%[src]) \n\t" "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ "ulw %[tp3], 12(%[src]) \n\t" "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 16(%[src]) \n\t" "ulw %[tp2], 16(%[dst]) \n\t" - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ "ulw %[tp3], 20(%[src]) \n\t" "ulw %[tp4], 20(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 24(%[src]) \n\t" "ulw %[tp2], 24(%[dst]) \n\t" - "sw %[tn1], 16(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ "ulw %[tp3], 28(%[src]) \n\t" "ulw %[tp4], 28(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 24(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw 
%[tn2], 28(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; @@ -579,84 +544,82 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, prefetch_store(dst + 32); /* 16 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 64); prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride + 32); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 8(%[src]) \n\t" "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ "ulw %[tp3], 12(%[src]) \n\t" "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 16(%[src]) \n\t" "ulw %[tp2], 16(%[dst]) \n\t" - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ "ulw %[tp3], 20(%[src]) \n\t" "ulw %[tp4], 20(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 24(%[src]) \n\t" "ulw %[tp2], 24(%[dst]) \n\t" - "sw %[tn1], 16(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ "ulw %[tp3], 28(%[src]) \n\t" "ulw %[tp4], 28(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 32(%[src]) \n\t" "ulw %[tp2], 32(%[dst]) \n\t" - "sw %[tn1], 24(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 28(%[dst]) \n\t" /* store */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ "ulw %[tp3], 36(%[src]) \n\t" "ulw %[tp4], 36(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 40(%[src]) \n\t" "ulw %[tp2], 40(%[dst]) \n\t" - "sw %[tn1], 32(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 36(%[dst]) \n\t" /* store */ + "sw %[tn1], 32(%[dst]) \n\t" /* store */ + 
"adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 36(%[dst]) \n\t" /* store */ "ulw %[tp3], 44(%[src]) \n\t" "ulw %[tp4], 44(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 48(%[src]) \n\t" "ulw %[tp2], 48(%[dst]) \n\t" - "sw %[tn1], 40(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 44(%[dst]) \n\t" /* store */ + "sw %[tn1], 40(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 44(%[dst]) \n\t" /* store */ "ulw %[tp3], 52(%[src]) \n\t" "ulw %[tp4], 52(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 56(%[src]) \n\t" "ulw %[tp2], 56(%[dst]) \n\t" - "sw %[tn1], 48(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 52(%[dst]) \n\t" /* store */ + "sw %[tn1], 48(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 52(%[dst]) \n\t" /* store */ "ulw %[tp3], 60(%[src]) \n\t" "ulw %[tp4], 60(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 56(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 60(%[dst]) \n\t" /* store */ - - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) - : [src] "r" (src), [dst] "r" (dst) - ); + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 56(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; diff --git a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index db0c2a4da5ad6c317ffe17acdef0e357df73e9a6..9a9bab25a59a79a087a121e3e4095d5f395aaa2b 100644 --- a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -18,16 +18,13 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_avg_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3, Temp4; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; uint32_t vector4a = 64; uint32_t tp1, tp2; uint32_t p1, p2, p3, p4; @@ -45,7 +42,7 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -76,13 +73,13 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src, "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" "extp %[Temp3], $ac2, 31 \n\t" - "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ /* odd 1. 
pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ "mtlo %[vector4a], $ac3 \n\t" "mthi $zero, $ac3 \n\t" - "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ "preceu.ph.qbr %[n1], %[tp2] \n\t" "preceu.ph.qbl %[n2], %[tp2] \n\t" "preceu.ph.qbr %[n3], %[tn2] \n\t" @@ -93,46 +90,44 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src, "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ /* odd 2. pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ "mtlo %[vector4a], $ac2 \n\t" "mthi $zero, $ac2 \n\t" "preceu.ph.qbr %[n1], %[tn1] \n\t" - "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ - "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" "extp %[Temp4], $ac2, 31 \n\t" - "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ - "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ /* clamp */ - "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ - "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ - "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ - - "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ - "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ - - "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ - "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ - - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); /* Next row... 
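Both the plain averaging copy in vpx_convolve_avg_dspr2 above (adduh_r.qb on whole 32-bit words) and the avg horizontal kernel here (addqh_r.w after the filter) reduce to the same rounding average, dst = (dst + src + 1) >> 1 per pixel. A sketch of one word of that operation, with an invented helper name, under the assumption that four packed bytes are handled per word as in the DSPR2 code:

#include <stdint.h>

/* Illustrative only: per-byte rounding average of a source word into a
 * destination word, the scalar equivalent of one adduh_r.qb. */
static uint32_t avg_word_sketch(uint32_t src_word, uint32_t dst_word) {
  uint32_t out = 0;
  int i;
  for (i = 0; i < 4; ++i) {
    uint32_t s = (src_word >> (8 * i)) & 0xffu;
    uint32_t d = (dst_word >> (8 * i)) & 0xffu;
    out |= ((s + d + 1) >> 1) << (8 * i);
  }
  return out;
}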
*/ src += src_stride; @@ -140,12 +135,9 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src, } } -static void convolve_avg_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector4a = 64; @@ -167,7 +159,7 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -309,17 +301,15 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src, "sb %[tn3], 5(%[dst]) \n\t" "sb %[tn1], 7(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), - [st0] "=&r" (st0), [st1] "=&r" (st1), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -328,11 +318,9 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src, } static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, + int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, + const int16_t *filter_x0, int32_t h, int32_t count) { int32_t y, c; const uint8_t *src; @@ -360,7 +348,7 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -618,16 +606,15 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [qload3] "=&r" (qload3), [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); src += 16; dst += 16; @@ -640,11 +627,9 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, } static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, + int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + const int16_t *filter_x0, int32_t h) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -673,7 +658,7 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride + 32); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -931,16 +916,15 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [qload3] "=&r" (qload3), [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + 
[filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); src += 16; dst += 16; @@ -961,22 +945,17 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_avg_horiz_dspr2(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } else { uint32_t pos = 38; src -= 3; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -985,39 +964,32 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_avg_horiz_4_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; case 8: - convolve_avg_horiz_8_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; case 16: - convolve_avg_horiz_16_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h, 1); + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); break; case 32: - convolve_avg_horiz_16_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h, 2); + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); break; case 64: prefetch_load(src + 64); prefetch_store(dst + 32); - convolve_avg_horiz_64_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; default: - vpx_convolve8_avg_horiz_c(src + 3, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, + h); break; } } diff --git a/vpx_dsp/mips/convolve8_dspr2.c b/vpx_dsp/mips/convolve8_dspr2.c index ddad186922835c8933fa1c377e7e966b43dc4925..789ec8d53d8313ae7672b041a1a858d53ad85745 100644 --- a/vpx_dsp/mips/convolve8_dspr2.c +++ b/vpx_dsp/mips/convolve8_dspr2.c @@ -19,8 +19,7 @@ #if HAVE_DSPR2 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int16_t *filter_x0, int32_t h) { @@ -45,7 +44,7 @@ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -118,15 +117,14 @@ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, "sb %[p2], 0(%[dst_ptr]) \n\t" "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), - [dst_ptr] "+r" (dst_ptr) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [src] "r" (src), 
[dst_stride] "r" (dst_stride) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_stride] "r"(dst_stride)); /* Next row... */ src += src_stride; @@ -135,8 +133,7 @@ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, } static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int16_t *filter_x0, int32_t h) { @@ -164,7 +161,7 @@ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, dst_ptr = dst; odd_dst = (dst_ptr + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp2], 0(%[src]) \n\t" "ulw %[tp1], 4(%[src]) \n\t" @@ -293,16 +290,14 @@ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, "sb %[n1], 0(%[odd_dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); /* Next row... */ src += src_stride; @@ -310,25 +305,21 @@ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, } } -static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, - int32_t count) { +static void convolve_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { int32_t c, y; const uint8_t *src; uint8_t *dst; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; uint32_t qload1, qload2; uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; + uint8_t *odd_dst; filter12 = ((const int32_t *)filter_x0)[0]; filter34 = ((const int32_t *)filter_x0)[1]; @@ -346,248 +337,439 @@ static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, odd_dst = (dst + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" /* even 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "ulw %[qload2], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ /* even 4. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 16(%[src]) \n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ /* even 7. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" /* odd 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ /* odd 6. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); src += 16; dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); @@ -601,24 +783,21 @@ static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, } } -static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { int32_t c, y; const uint8_t *src; uint8_t *dst; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; uint32_t qload1, qload2; uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; + uint8_t *odd_dst; filter12 = ((const int32_t *)filter_x0)[0]; filter34 = ((const int32_t *)filter_x0)[1]; @@ -637,248 +816,439 @@ static void convolve_horiz_64_transposed_dspr2(const 
uint8_t *src_ptr, odd_dst = (dst + dst_stride); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "ulw %[qload2], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ /* even 3. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 16(%[src]) \n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ /* even 6. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" /* odd 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ /* odd 6. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); src += 16; dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); @@ -901,8 +1271,7 @@ void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { int sum = 0; - for (k = 0; k < 8; ++k) - sum += src[x + k] * filter[k]; + for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k]; dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); } @@ -913,8 +1282,7 @@ void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - int w, int h) { + uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { int x, y; for (y = 0; y < h; ++y) { @@ -927,10 +1295,9 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t 
*filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; @@ -941,27 +1308,20 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(((const int32_t *)filter_x)[1] != 0x800000); assert(((const int32_t *)filter_y)[1] != 0x800000); - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); - if (intermediate_height < h) - intermediate_height = h; + if (intermediate_height < h) intermediate_height = h; /* copy the src to dst */ if (filter_x[3] == 0x80) { - copy_horiz_transposed(src - src_stride * 3, src_stride, - temp, intermediate_height, - w, intermediate_height); + copy_horiz_transposed(src - src_stride * 3, src_stride, temp, + intermediate_height, w, intermediate_height); } else if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_dspr2(src - src_stride * 3, src_stride, - temp, intermediate_height, - filter_x, - w, intermediate_height); + vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp, + intermediate_height, filter_x, w, intermediate_height); } else { src -= (src_stride * 3 + 3); @@ -971,31 +1331,29 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_horiz_4_transposed_dspr2(src, src_stride, - temp, intermediate_height, - filter_x, intermediate_height); + convolve_horiz_4_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); break; case 8: - convolve_horiz_8_transposed_dspr2(src, src_stride, - temp, intermediate_height, - filter_x, intermediate_height); + convolve_horiz_8_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); break; case 16: case 32: - convolve_horiz_16_transposed_dspr2(src, src_stride, - temp, intermediate_height, - filter_x, intermediate_height, - (w/16)); + convolve_horiz_16_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height, (w / 16)); break; case 64: prefetch_load(src + 32); - convolve_horiz_64_transposed_dspr2(src, src_stride, - temp, intermediate_height, - filter_x, intermediate_height); + convolve_horiz_64_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); break; default: - convolve_horiz_transposed(src, src_stride, - temp, intermediate_height, + convolve_horiz_transposed(src, src_stride, temp, intermediate_height, filter_x, w, intermediate_height); break; } @@ -1003,40 +1361,31 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, /* copy the src to dst */ if (filter_y[3] == 0x80) { - copy_horiz_transposed(temp + 3, intermediate_height, - dst, dst_stride, - h, w); + copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); } else if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_dspr2(temp + 3, intermediate_height, - dst, dst_stride, - filter_y, - h, w); + vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, + filter_y, h, w); } else { switch (h) { case 4: - convolve_horiz_4_transposed_dspr2(temp, intermediate_height, - dst, dst_stride, - filter_y, w); + convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); break; case 8: - convolve_horiz_8_transposed_dspr2(temp, intermediate_height, - dst, dst_stride, - filter_y, w); + 
convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); break; case 16: case 32: - convolve_horiz_16_transposed_dspr2(temp, intermediate_height, - dst, dst_stride, - filter_y, w, (h/16)); + convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w, (h / 16)); break; case 64: - convolve_horiz_64_transposed_dspr2(temp, intermediate_height, - dst, dst_stride, - filter_y, w); + convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); break; default: - convolve_horiz_transposed(temp, intermediate_height, - dst, dst_stride, + convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride, filter_y, h, w); break; } @@ -1056,97 +1405,87 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, prefetch_store(dst); switch (w) { - case 4: - { + case 4: { uint32_t tp1; /* 1 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], (%[src]) \n\t" - "sw %[tp1], (%[dst]) \n\t" /* store */ + "sw %[tp1], (%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; - case 8: - { + } break; + case 8: { uint32_t tp1, tp2; /* 2 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; - case 16: - { + } break; + case 16: { uint32_t tp1, tp2, tp3, tp4; /* 4 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp3], 8(%[src]) \n\t" "ulw %[tp4], 12(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; - case 32: - { + } break; + case 32: { uint32_t tp1, tp2, tp3, tp4; uint32_t tp5, tp6, tp7, tp8; /* 8 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp3], 8(%[src]) 
\n\t" @@ -1156,29 +1495,25 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, "ulw %[tp7], 24(%[src]) \n\t" "ulw %[tp8], 28(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - "sw %[tp5], 16(%[dst]) \n\t" /* store */ - "sw %[tp6], 20(%[dst]) \n\t" /* store */ - "sw %[tp7], 24(%[dst]) \n\t" /* store */ - "sw %[tp8], 28(%[dst]) \n\t" /* store */ - - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), - [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) - : [src] "r" (src), [dst] "r" (dst) - ); + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; - case 64: - { + } break; + case 64: { uint32_t tp1, tp2, tp3, tp4; uint32_t tp5, tp6, tp7, tp8; @@ -1186,14 +1521,14 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, prefetch_store(dst + 32); /* 16 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 64); prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride + 32); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp3], 8(%[src]) \n\t" @@ -1203,14 +1538,14 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, "ulw %[tp7], 24(%[src]) \n\t" "ulw %[tp8], 28(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - "sw %[tp5], 16(%[dst]) \n\t" /* store */ - "sw %[tp6], 20(%[dst]) \n\t" /* store */ - "sw %[tp7], 24(%[dst]) \n\t" /* store */ - "sw %[tp8], 28(%[dst]) \n\t" /* store */ + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ "ulw %[tp1], 32(%[src]) \n\t" "ulw %[tp2], 36(%[src]) \n\t" @@ -1221,29 +1556,26 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, "ulw %[tp7], 56(%[src]) \n\t" "ulw %[tp8], 60(%[src]) \n\t" - "sw %[tp1], 32(%[dst]) \n\t" /* store */ - "sw %[tp2], 36(%[dst]) \n\t" /* store */ - "sw %[tp3], 40(%[dst]) \n\t" /* store */ - "sw %[tp4], 44(%[dst]) \n\t" /* store */ - "sw %[tp5], 48(%[dst]) \n\t" /* store */ - "sw %[tp6], 52(%[dst]) \n\t" /* store */ - "sw %[tp7], 56(%[dst]) \n\t" /* store */ - "sw %[tp8], 60(%[dst]) \n\t" /* store */ - - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), - [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) - : [src] "r" (src), [dst] "r" (dst) - ); + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 
36(%[dst]) \n\t" /* store */ + "sw %[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; + } break; default: - for (y = h; y--; ) { + for (y = h; y--;) { for (x = 0; x < w; ++x) { dst[x] = src[x]; } diff --git a/vpx_dsp/mips/convolve8_horiz_dspr2.c b/vpx_dsp/mips/convolve8_horiz_dspr2.c index ae78bab8924832ca60ec8a1452f51484f9a7a260..196a0a2f0be98384895dd6b067700b17ceabcb90 100644 --- a/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -18,12 +18,9 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; int32_t vector1b, vector2b, vector3b, vector4b; @@ -45,7 +42,7 @@ static void convolve_horiz_4_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -111,17 +108,15 @@ static void convolve_horiz_4_dspr2(const uint8_t *src, "sb %[tp2], 2(%[dst]) \n\t" "sb %[n2], 3(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -129,12 +124,9 @@ static void convolve_horiz_4_dspr2(const uint8_t *src, } } -static void convolve_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector4a = 64; @@ -156,7 +148,7 @@ static void convolve_horiz_8_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -275,17 +267,15 @@ static void convolve_horiz_8_dspr2(const uint8_t *src, "sb %[p2], 5(%[dst]) \n\t" "sb %[n1], 7(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), - [st0] "=&r" (st0), [st1] "=&r" (st1), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -293,12 +283,9 @@ static void convolve_horiz_8_dspr2(const uint8_t *src, } } -static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h, int32_t count) { int32_t y, c; const uint8_t *src; @@ -326,7 +313,7 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -542,17 +529,15 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), - [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); src += 16; dst += 16; @@ -564,12 +549,9 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, } } -static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -598,7 +580,7 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride + 32); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -814,17 +796,15 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), - [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] 
"=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); src += 16; dst += 16; @@ -839,17 +819,14 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_horiz_dspr2(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -857,11 +834,9 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, src -= 3; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -870,39 +845,31 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_horiz_4_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; case 8: - convolve_horiz_8_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; case 16: - convolve_horiz_16_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h, 1); + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); break; case 32: - convolve_horiz_16_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h, 2); + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); break; case 64: prefetch_load(src + 64); prefetch_store(dst + 32); - convolve_horiz_64_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src + 3, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve8_vert_dspr2.c b/vpx_dsp/mips/convolve8_vert_dspr2.c index d553828c59a2b4a51f3cb65f0ba93281aeeff217..ad107d5c47309d8b9540ac0d43fd9464e7052482 100644 --- a/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -18,12 +18,9 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_vert_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t w, +static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, int32_t h) { int32_t x, y; const uint8_t *src_ptr; @@ -53,7 +50,7 @@ static void 
convolve_vert_4_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -152,19 +149,16 @@ static void convolve_vert_4_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [n1] "=&r" (n1), [n2] "=&r" (n2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), - [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -173,12 +167,9 @@ static void convolve_vert_4_dspr2(const uint8_t *src, } } -static void convolve_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t h) { +static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { int32_t x, y; const uint8_t *src_ptr; uint8_t *dst_ptr; @@ -208,7 +199,7 @@ static void convolve_vert_64_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -307,19 +298,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [n1] "=&r" (n1), [n2] "=&r" (n2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), - [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... 
*/ @@ -331,50 +319,38 @@ static void convolve_vert_64_dspr2(const uint8_t *src, void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_vert_dspr2(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } else { uint32_t pos = 38; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); prefetch_store(dst); switch (w) { - case 4 : - case 8 : - case 16 : - case 32 : - convolve_vert_4_dspr2(src, src_stride, - dst, dst_stride, - filter_y, w, h); + case 4: + case 8: + case 16: + case 32: + convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); break; - case 64 : + case 64: prefetch_store(dst + 32); - convolve_vert_64_dspr2(src, src_stride, - dst, dst_stride, - filter_y, h); + convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve_common_dspr2.h b/vpx_dsp/mips/convolve_common_dspr2.h index 66d77a28544b9619432f646f530943899f320968..4eee3bd5e1580f54648c49ea2b078f7777947d4f 100644 --- a/vpx_dsp/mips/convolve_common_dspr2.h +++ b/vpx_dsp/mips/convolve_common_dspr2.h @@ -25,8 +25,8 @@ extern "C" { void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); + const int16_t *filter_y, int y_step_q4, int w, + int h); void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -37,19 +37,18 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); + const int16_t *filter_y, int y_step_q4, int w, + int h); -void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter, - int w, int h); +void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h); void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); + const int16_t *filter_y, int y_step_q4, int w, + int h); #endif // #if HAVE_DSPR2 #ifdef __cplusplus diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c index e98a0399ba3c4fd95bcf29f9a48f5cc160b397fb..402d7ed99793b2831389b08c4c377fe37887a1de 100644 --- a/vpx_dsp/mips/deblock_msa.c +++ b/vpx_dsp/mips/deblock_msa.c @@ -13,133 +13,132 @@ 
extern const int16_t vpx_rv[]; -#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7, \ - out8, out9, out10, out11, \ - out12, out13, out14, out15) \ -{ \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ - temp0, temp1, temp2, temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ - ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ - temp0, temp1, temp2, temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out8, out10); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out12, out14); \ - out0 = (v16u8)temp6; \ - out2 = (v16u8)temp7; \ - out4 = (v16u8)temp8; \ - out6 = (v16u8)temp9; \ - out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ - out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ - out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ - out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ - out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ -} +#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7, \ + out8, out9, out10, out11, out12, out13, out14, \ + out15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ + ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out8, out10); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out12, out14); \ + out0 = (v16u8)temp6; \ + out2 = (v16u8)temp7; \ + out4 = (v16u8)temp8; \ + out6 = (v16u8)temp9; \ + out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ + out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ + out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ + out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ + out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ + } -#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, \ - below1_in, below2_in, ref, out) \ -{ \ - v16u8 temp0, temp1; \ - \ - temp1 = __msa_aver_u_b(above2_in, above1_in); \ - temp0 = __msa_aver_u_b(below2_in, below1_in); \ - temp1 = __msa_aver_u_b(temp1, temp0); \ - out = __msa_aver_u_b(src_in, temp1); \ - temp0 = __msa_asub_u_b(src_in, above2_in); \ - temp1 = __msa_asub_u_b(src_in, above1_in); \ - temp0 = (temp0 < ref); \ - temp1 = (temp1 < ref); \ - temp0 = temp0 & temp1; \ - temp1 = __msa_asub_u_b(src_in, below1_in); \ - temp1 = (temp1 < ref); \ - temp0 = temp0 & temp1; \ - temp1 = 
__msa_asub_u_b(src_in, below2_in); \ - temp1 = (temp1 < ref); \ - temp0 = temp0 & temp1; \ - out = __msa_bmz_v(out, src_in, temp0); \ -} +#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \ + ref, out) \ + { \ + v16u8 temp0, temp1; \ + \ + temp1 = __msa_aver_u_b(above2_in, above1_in); \ + temp0 = __msa_aver_u_b(below2_in, below1_in); \ + temp1 = __msa_aver_u_b(temp1, temp0); \ + out = __msa_aver_u_b(src_in, temp1); \ + temp0 = __msa_asub_u_b(src_in, above2_in); \ + temp1 = __msa_asub_u_b(src_in, above1_in); \ + temp0 = (temp0 < ref); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + temp1 = __msa_asub_u_b(src_in, below1_in); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + temp1 = __msa_asub_u_b(src_in, below2_in); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + out = __msa_bmz_v(out, src_in, temp0); \ + } -#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15) \ -{ \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ - ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ - ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ - ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ - ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ - ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ - ILVRL_H2_SH(temp5, temp4, temp6, temp7); \ - ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \ - ILVRL_H2_SH(temp5, temp4, temp8, temp9); \ - ILVRL_W2_SH(temp8, temp6, temp4, temp5); \ - ILVRL_W2_SH(temp9, temp7, temp6, temp7); \ - ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \ - ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \ - in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \ - in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \ - ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \ - ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \ - in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \ - in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \ - ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \ - temp2, temp3, temp4, temp5); \ - ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \ - temp6, temp7, temp8, temp9); \ - ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \ - in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \ - in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \ - ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \ - in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \ - in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \ -} +#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ + ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ + ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ + ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ + ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ + ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ + ILVRL_H2_SH(temp5, temp4, temp6, temp7); \ + ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \ + ILVRL_H2_SH(temp5, temp4, temp8, temp9); \ + ILVRL_W2_SH(temp8, temp6, temp4, temp5); \ + ILVRL_W2_SH(temp9, temp7, temp6, temp7); \ + 
ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \ + ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \ + in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \ + in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \ + ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \ + in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \ + in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \ + ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3, \ + temp4, temp5); \ + ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \ + temp7, temp8, temp9); \ + ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \ + in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \ + in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \ + ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \ + in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \ + in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \ + } -#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \ - in6, in7, in8, in9, in10, in11) \ -{ \ - v8i16 temp0, temp1, temp2, temp3; \ - v8i16 temp4, temp5, temp6, temp7; \ - \ - ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ - ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ - ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ - ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ - ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \ - temp4 = __msa_ilvr_h(temp5, temp4); \ - ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \ - temp5 = __msa_ilvr_h(temp7, temp6); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - in0 = (v16u8)temp0; \ - in2 = (v16u8)temp1; \ - in4 = (v16u8)temp2; \ - in6 = (v16u8)temp3; \ - in8 = (v16u8)temp6; \ - in10 = (v16u8)temp7; \ - in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \ - in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \ - in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \ - in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \ - in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \ - in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \ -} +#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \ + in9, in10, in11) \ + { \ + v8i16 temp0, temp1, temp2, temp3; \ + v8i16 temp4, temp5, temp6, temp7; \ + \ + ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ + ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ + ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ + ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ + ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \ + temp4 = __msa_ilvr_h(temp5, temp4); \ + ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \ + temp5 = __msa_ilvr_h(temp7, temp6); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + in0 = (v16u8)temp0; \ + in2 = (v16u8)temp1; \ + in4 = (v16u8)temp2; \ + in6 = (v16u8)temp3; \ + in8 = (v16u8)temp6; \ + in10 = (v16u8)temp7; \ + in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \ + in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \ + in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \ + in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \ + in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \ + in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \ + } static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, int32_t src_stride, @@ -203,16 +202,16 @@ static void 
postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); above1 = LD_UB(p_src + 9 * src_stride); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); - out0 = __msa_copy_u_d((v2i64) inter0, 0); - out1 = __msa_copy_u_d((v2i64) inter1, 0); - out2 = __msa_copy_u_d((v2i64) inter2, 0); - out3 = __msa_copy_u_d((v2i64) inter3, 0); + out0 = __msa_copy_u_d((v2i64)inter0, 0); + out1 = __msa_copy_u_d((v2i64)inter1, 0); + out2 = __msa_copy_u_d((v2i64)inter2, 0); + out3 = __msa_copy_u_d((v2i64)inter3, 0); SD4(out0, out1, out2, out3, p_dst, dst_stride); - out0 = __msa_copy_u_d((v2i64) inter4, 0); - out1 = __msa_copy_u_d((v2i64) inter5, 0); - out2 = __msa_copy_u_d((v2i64) inter6, 0); - out3 = __msa_copy_u_d((v2i64) inter7, 0); + out0 = __msa_copy_u_d((v2i64)inter4, 0); + out1 = __msa_copy_u_d((v2i64)inter5, 0); + out2 = __msa_copy_u_d((v2i64)inter6, 0); + out3 = __msa_copy_u_d((v2i64)inter7, 0); SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); } @@ -236,36 +235,36 @@ static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, src = inter2; below1 = inter3; below2 = inter4; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2); above2 = inter5; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3); above1 = inter6; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4); src = inter7; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3); VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5); below1 = inter8; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4); VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6); below2 = inter9; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7); if (col == (cols / 8 - 1)) { above2 = inter9; } else { above2 = inter10; } - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8); if (col == (cols / 8 - 1)) { above1 = inter9; } else { above1 = inter11; } - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9); TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8, inter9, inter2, inter3, inter4, inter5, inter6, inter7, @@ -371,36 +370,36 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, src = inter2; below1 = inter3; below2 = inter4; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2); above2 = inter5; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3); above1 = inter6; - ref_temp 
= (v16u8) __msa_splati_b((v16i8) ref, 2); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4); src = inter7; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3); VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5); below1 = inter8; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4); VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6); below2 = inter9; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7); if (col == (cols / 8 - 1)) { above2 = inter9; } else { above2 = inter10; } - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8); if (col == (cols / 8 - 1)) { above1 = inter9; } else { above1 = inter11; } - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9); VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8, inter9, inter2, inter3, inter4, inter5, @@ -452,8 +451,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, int32_t row, col, cnt; uint8_t *src_dup = src_ptr; v16u8 src0, src, tmp_orig; - v16u8 tmp = {0}; - v16i8 zero = {0}; + v16u8 tmp = { 0 }; + v16i8 zero = { 0 }; v8u16 sum_h, src_r_h, src_l_h; v4u32 src_r_w, src_l_w; v4i32 flimit_vec; @@ -462,13 +461,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, for (row = rows; row--;) { int32_t sum_sq = 0; int32_t sum = 0; - src0 = (v16u8) __msa_fill_b(src_dup[0]); + src0 = (v16u8)__msa_fill_b(src_dup[0]); ST8x1_UB(src0, (src_dup - 8)); - src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]); + src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]); ST_UB(src0, src_dup + cols); src_dup[cols + 16] = src_dup[cols - 1]; - tmp_orig = (v16u8) __msa_ldi_b(0); + tmp_orig = (v16u8)__msa_ldi_b(0); tmp_orig[15] = tmp[15]; src = LD_UB(src_dup - 8); src[15] = 0; @@ -508,9 +507,9 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, sum = sum_l[7]; src = LD_UB(src_dup + 16 * col); ILVRL_B2_UH(zero, src, src_r_h, src_l_h); - src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4); - src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4); - tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7); + src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4); + src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4); + tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7); HADD_UB2_UH(src_r, src_l, add_r, add_l); UNPCK_SH_SW(sub_r, sub0, sub1); @@ -552,13 +551,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, total2 = (total2 < flimit_vec); total3 = (total3 < flimit_vec); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); - mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0); - tmp = __msa_bmz_v(tmp, src, (v16u8) mask); + mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); + tmp = __msa_bmz_v(tmp, src, (v16u8)mask); if (col == 0) { uint64_t src_d; - src_d = __msa_copy_u_d((v2i64) tmp_orig, 1); + src_d = __msa_copy_u_d((v2i64)tmp_orig, 1); SD(src_d, (src_dup - 8)); } @@ -588,15 +587,15 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, for (col = 0; col 
< (cols >> 4); ++col) { uint8_t *dst_tmp = &dst_ptr[col << 4]; v16u8 dst; - v16i8 zero = {0}; + v16i8 zero = { 0 }; v16u8 tmp[16]; v8i16 mult0, mult1, rv2_0, rv2_1; - v8i16 sum0_h = {0}; - v8i16 sum1_h = {0}; - v4i32 mul0 = {0}; - v4i32 mul1 = {0}; - v4i32 mul2 = {0}; - v4i32 mul3 = {0}; + v8i16 sum0_h = { 0 }; + v8i16 sum1_h = { 0 }; + v4i32 mul0 = { 0 }; + v4i32 mul1 = { 0 }; + v4i32 mul2 = { 0 }; + v4i32 mul3 = { 0 }; v4i32 sum0_w, sum1_w, sum2_w, sum3_w; v4i32 add0, add1, add2, add3; const int16_t *rv2[16]; @@ -618,10 +617,10 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, dst = LD_UB(dst_tmp + (cnt * pitch)); UNPCK_UB_SH(dst, dst_r_h, dst_l_h); MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1); - mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0); - mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0); - mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1); - mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1); + mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0); + mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0); + mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1); + mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1); ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h); } @@ -652,7 +651,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h); dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4); dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4); - tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7); + tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7); UNPCK_SH_SW(sum0_h, sum0_w, sum1_w); UNPCK_SH_SW(sum1_h, sum2_w, sum3_w); @@ -669,8 +668,8 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, total2 = (total2 < flimit_vec); total3 = (total3 < flimit_vec); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); - mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0); - tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask); + mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); + tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask); if (row >= 8) { ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch)); diff --git a/vpx_dsp/mips/fwd_dct32x32_msa.c b/vpx_dsp/mips/fwd_dct32x32_msa.c index f29c14b3d0e31154add3b6786d506853513b7e46..e41a904808e1826a8d6ee6a98ee2fef2cce8491c 100644 --- a/vpx_dsp/mips/fwd_dct32x32_msa.c +++ b/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -27,10 +27,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input, SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, - step0, step1, step2, step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); ST_SH4(step0, step1, step2, step3, temp_buff, 8); ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); @@ -45,10 +45,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input, SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); 
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, - step0, step1, step2, step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); @@ -64,12 +64,12 @@ static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { /* fdct even */ LD_SH4(input, 8, in0, in1, in2, in3); LD_SH4(input + 96, 8, in12, in13, in14, in15); - BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, in12, in13, in14, in15); + BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, + vec3, in12, in13, in14, in15); LD_SH4(input + 32, 8, in4, in5, in6, in7); LD_SH4(input + 64, 8, in8, in9, in10, in11); - BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, - vec4, vec5, vec6, vec7, in8, in9, in10, in11); + BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, + in8, in9, in10, in11); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); @@ -258,28 +258,26 @@ static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, - in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - step0, step1, step2, step3, step4, step5, step6, step7, - in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); /* 2nd set */ LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, - in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - step0, step1, step2, step3, step4, step5, step6, step7, - in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, 
in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, (output + 8 * 8), 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); @@ -299,10 +297,9 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, - in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); @@ -315,19 +312,19 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, UNPCK_SH_SW(vec5, vec5_l, vec5_r); UNPCK_SH_SW(vec6, vec6_l, vec6_r); UNPCK_SH_SW(vec7, vec7_l, vec7_r); - ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, - tmp0_w, tmp1_w, tmp2_w, tmp3_w); + ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, + tmp1_w, tmp2_w, tmp3_w); BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); - ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, - vec0_r, vec1_r, vec2_r, vec3_r); + ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, + vec1_r, vec2_r, vec3_r); tmp3_w = vec0_r + vec3_r; vec0_r = vec0_r - vec3_r; vec3_r = vec1_r + vec2_r; vec1_r = vec1_r - vec2_r; - DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, - cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); @@ -335,8 +332,8 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out, 8); - DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, - cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r); + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); @@ -401,10 +398,9 @@ static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, - in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); @@ -610,8 +606,8 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { in3 = LD_SH(temp + 192); in5 = LD_SH(temp + 216); 
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* 2nd set */ in0_1 = LD_SH(temp + 16); @@ -637,10 +633,10 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { in6 = LD_SH(temp + 104); in7 = LD_SH(temp + 144); - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - output + 8, 32); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8, + 32); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); /* 4th set */ @@ -655,12 +651,11 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - output + 24, 32); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24, + 32); } -static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, - int16_t *output) { +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { fdct8x32_1d_row_load_butterfly(temp, temp_buf); fdct8x32_1d_row_even(temp_buf, temp_buf); fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); @@ -706,10 +701,9 @@ static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, - in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); FDCT_POSTPROC_2V_NEG_H(vec0, vec1); FDCT_POSTPROC_2V_NEG_H(vec2, vec3); FDCT_POSTPROC_2V_NEG_H(vec4, vec5); diff --git a/vpx_dsp/mips/fwd_txfm_msa.c b/vpx_dsp/mips/fwd_txfm_msa.c index 5571d220e42843006c171760732ac75340e098c1..cb3d6282d5a27b249db307517973c3966dab3ee6 100644 --- a/vpx_dsp/mips/fwd_txfm_msa.c +++ b/vpx_dsp/mips/fwd_txfm_msa.c @@ -18,24 +18,24 @@ void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; - v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, - -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; - v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, - cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; - v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, - 0, 0, 0, 0 }; - - LD_SH16(input, src_stride, - in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15); + v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; + v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; + v8i16 coeff2 = { + -cospi_2_64, 
-cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 + }; + + LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, + in10, in11, in12, in13, in14, in15); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in8, in9, in10, in11, 2); SLLI_4V(in12, in13, in14, in15, 2); ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, - tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); @@ -137,10 +137,10 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) { LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, - in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); @@ -150,19 +150,19 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) { SRA_4V(in8, in9, in10, in11, 2); SRA_4V(in12, in13, in14, in15, 2); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, - tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, + tmp7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, - tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); - FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, - tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); - TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, - tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); } @@ -203,14 +203,14 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output, LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 
2); - VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); } diff --git a/vpx_dsp/mips/fwd_txfm_msa.h b/vpx_dsp/mips/fwd_txfm_msa.h index d7bb316d5be812eb01b8441b22f5ffde6968678e..6458dec6d20f3ebc11e78675b41d463ca0e8a5c2 100644 --- a/vpx_dsp/mips/fwd_txfm_msa.h +++ b/vpx_dsp/mips/fwd_txfm_msa.h @@ -14,358 +14,365 @@ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" -#define LD_HADD(psrc, stride) ({ \ - v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ - v4i32 vec_w_m; \ - \ - LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ - ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ - LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ - ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \ - in4_m, in6_m, in0_m, in4_m); \ - in0_m += in4_m; \ - \ - vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ - HADD_SW_S32(vec_w_m); \ -}) +#define LD_HADD(psrc, stride) \ + ({ \ + v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ + v4i32 vec_w_m; \ + \ + LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ + ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ + LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ + ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ + in0_m, in4_m); \ + in0_m += in4_m; \ + \ + vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ + HADD_SW_S32(vec_w_m); \ + }) -#define VPX_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ - v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ - cospi_24_64, -cospi_8_64, 0, 0, 0 }; \ - \ - BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ - ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ - cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ - vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ - \ - vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ - cnst2_m = __msa_splati_h(coeff_m, 2); \ - cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ - vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ - \ - SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ - PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, \ - vec7_m, vec7_m, out0, out2, out1, out3); \ -} +#define VPX_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ + v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ + 
v8i16 coeff_m = { \ + cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \ + }; \ + \ + BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ + cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ + vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ + cnst2_m = __msa_splati_h(coeff_m, 2); \ + cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ + vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \ + vec7_m, out0, out2, out1, out3); \ + } -#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) { \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - \ - SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ - SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ - AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, \ - in0, in1, in2, in3); \ - AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, \ - in4, in5, in6, in7); \ -} +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ + SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ + AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \ + in2, in3); \ + AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \ + in6, in7); \ + } -#define VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ - v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ - cospi_24_64, cospi_4_64, cospi_28_64, \ - cospi_12_64, cospi_20_64 }; \ - \ - /* FDCT stage1 */ \ - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ - s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ - BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ - ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ - ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x1_m, x0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ - x2_m = -x2_m; \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - x2_m = __msa_splati_h(coeff_m, 2); \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - /* stage2 */ \ - ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ - \ - s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - /* stage3 */ \ - BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ - \ - /* stage4 */ \ - ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ - ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x0_m, x1_m); \ - out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ - 
x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - \ - x1_m = __msa_splati_h(coeff_m, 5); \ - x0_m = -x0_m; \ - x0_m = __msa_ilvev_h(x1_m, x0_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ - \ - x2_m = __msa_splati_h(coeff_m, 6); \ - x3_m = -x3_m; \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ -} +#define VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } -#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ - v8i16 x0_m, x1_m, x2_m, x3_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ - cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v8i16 x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ \ - /* FDCT stage1 */ \ - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ - s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ - BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ - ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ - ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x1_m, x0_m); \ 
- out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ \ - SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ - x2_m = -x2_m; \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ \ - out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - x2_m = __msa_splati_h(coeff_m, 2); \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ \ - /* stage2 */ \ - ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ \ - s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ \ - /* stage3 */ \ - BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ \ - /* stage4 */ \ - ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ - ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ \ - SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x0_m, x1_m); \ - out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ \ - SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ \ - x1_m = __msa_splati_h(coeff_m, 5); \ - x0_m = -x0_m; \ - x0_m = __msa_ilvev_h(x1_m, x0_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ \ - x2_m = __msa_splati_h(coeff_m, 6); \ - x3_m = -x3_m; \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ -} + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } -#define FDCT8x16_ODD(input0, input1, input2, input3, \ - input4, input5, input6, input7, \ - out1, out3, out5, out7, \ - out9, out11, out13, out15) { \ - v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ - v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ - v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ - v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ - v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ - cospi_24_64, -cospi_8_64, 
-cospi_24_64, \ - cospi_12_64, cospi_20_64 }; \ - v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, \ - cospi_18_64, cospi_10_64, cospi_22_64, \ - cospi_6_64, cospi_26_64 }; \ - v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64, \ - -cospi_26_64, 0, 0, 0, 0 }; \ - \ - /* stp 1 */ \ - ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ - ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ - \ - cnst4_m = __msa_splati_h(coeff_m, 0); \ - stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ - \ - cnst5_m = __msa_splati_h(coeff_m, 1); \ - cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ - stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ - stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ - stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ - \ - /* stp2 */ \ - BUTTERFLY_4(input0, input1, stp22_m, stp23_m, \ - stp30_m, stp31_m, stp32_m, stp33_m); \ - BUTTERFLY_4(input7, input6, stp25_m, stp24_m, \ - stp37_m, stp36_m, stp35_m, stp34_m); \ - \ - ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ - ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff_m, 4); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ - \ - cnst0_m = __msa_splati_h(coeff_m, 3); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ - \ - /* stp4 */ \ - BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, \ - vec6_m, vec2_m, vec4_m, vec5_m); \ - BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, \ - stp21_m, stp23_m, stp24_m, stp31_m); \ - \ - ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - \ - out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff2_m, 0); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - \ - cnst1_m = __msa_splati_h(coeff2_m, 2); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff2_m, 1); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - \ - cnst1_m = __msa_splati_h(coeff2_m, 3); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ -} +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + 
input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + { \ + v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ + v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \ + v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \ + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \ + v8i16 coeff2_m = { \ + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \ + }; \ + \ + /* stp 1 */ \ + ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ + ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __msa_splati_h(coeff_m, 0); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ + \ + cnst5_m = __msa_splati_h(coeff_m, 1); \ + cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ + stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ + stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ + \ + /* stp2 */ \ + BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \ + stp33_m); \ + BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \ + stp34_m); \ + \ + ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ + ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 4); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 3); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + /* stp4 */ \ + BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \ + vec5_m); \ + BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \ + stp31_m); \ + \ + ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + \ + out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 0); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 2); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 1); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp24_m, 
stp31_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 3); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + } -#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one_m = __msa_ldi_h(1); \ - \ - tp0_m = __msa_clti_s_h(vec0, 0); \ - tp1_m = __msa_clti_s_h(vec1, 0); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = one_m & tp0_m; \ - tp1_m = one_m & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ -} +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one_m = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clti_s_h(vec0, 0); \ + tp1_m = __msa_clti_s_h(vec1, 0); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one_m & tp0_m; \ + tp1_m = one_m & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } -#define FDCT32_POSTPROC_NEG_W(vec) { \ - v4i32 temp_m; \ - v4i32 one_m = __msa_ldi_w(1); \ - \ - temp_m = __msa_clti_s_w(vec, 0); \ - vec += 1; \ - temp_m = one_m & temp_m; \ - vec += temp_m; \ - vec >>= 2; \ -} +#define FDCT32_POSTPROC_NEG_W(vec) \ + { \ + v4i32 temp_m; \ + v4i32 one_m = __msa_ldi_w(1); \ + \ + temp_m = __msa_clti_s_w(vec, 0); \ + vec += 1; \ + temp_m = one_m & temp_m; \ + vec += temp_m; \ + vec >>= 2; \ + } -#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one = __msa_ldi_h(1); \ +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one = __msa_ldi_h(1); \ \ - tp0_m = __msa_clei_s_h(vec0, 0); \ - tp1_m = __msa_clei_s_h(vec1, 0); \ - tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ - tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = one & tp0_m; \ - tp1_m = one & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ -} + tp0_m = __msa_clei_s_h(vec0, 0); \ + tp1_m = __msa_clei_s_h(vec1, 0); \ + tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ + tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one & tp0_m; \ + tp1_m = one & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } -#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, \ - reg1_right, const0, const1, \ - out0, out1, out2, out3) { \ - v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ - v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ - v4i32 k0_m = __msa_fill_w((int32_t) const0); \ - \ - s0_m = __msa_fill_w((int32_t) const1); \ - k0_m = __msa_ilvev_w(s0_m, k0_m); \ - \ - ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ - ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ - ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ - ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ - \ - DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ - \ - DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - 
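The two post-processing macros above (FDCT_POSTPROC_2V_NEG_H and FDCT32_POSTPROC_2V_POS_H) reduce, per 16-bit lane, to a small rounding step. A minimal scalar sketch of that per-lane arithmetic, under my reading of the MSA compare/shift intrinsics (the helper names below are illustrative, not libvpx symbols):

#include <stdint.h>

/* Illustrative only -- not part of the patch or of libvpx. */
static int16_t fdct_postproc_neg(int16_t x) {
  /* FDCT_POSTPROC_2V_NEG_H: add 1, add one more for negative lanes,
   * then arithmetic shift right by 2. */
  return (int16_t)((x + 1 + (x < 0)) >> 2);
}

static int16_t fdct_postproc_pos(int16_t x) {
  /* FDCT32_POSTPROC_2V_POS_H: same, but the extra 1 goes to positive lanes. */
  return (int16_t)((x + 1 + (x > 0)) >> 2);
}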
tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ -} +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ + v4i32 k0_m = __msa_fill_w((int32_t)const0); \ + \ + s0_m = __msa_fill_w((int32_t)const1); \ + k0_m = __msa_ilvev_w(s0_m, k0_m); \ + \ + ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ + ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ + ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ + ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ + \ + DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + \ + DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + } void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride); diff --git a/vpx_dsp/mips/idct16x16_msa.c b/vpx_dsp/mips/idct16x16_msa.c index 6d403efa7f6f71050d01104b2fda0cfaab463ae9..1cbeb35ba5efeccc909e95fa6d5a33a2e6275f5a 100644 --- a/vpx_dsp/mips/idct16x16_msa.c +++ b/vpx_dsp/mips/idct16x16_msa.c @@ -20,10 +20,10 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { input += 8; LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); - TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, - reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, - reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1, + reg2, reg3, reg4, reg5, reg6, reg7); + TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8, + reg9, reg10, reg11, reg12, reg13, reg14, reg15); DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); @@ -93,13 +93,13 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { reg3 = tmp7; /* transpose block */ - TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, - reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); + TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0, + reg2, reg4, reg6, reg8, reg10, reg12, reg14); ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); /* transpose block */ - TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, - reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); + TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3, + reg13, reg11, reg5, reg7, reg9, reg1, reg15); 
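DOTP_CONST_PAIR_W above, like the narrower DOTP_CONST_PAIR used in the idct16 row hunk that follows, packs the usual two-coefficient DCT rotation into MSA dot products followed by a rounded shift. Assuming the usual libvpx convention (DCT_CONST_BITS defined in vpx_dsp/txfm_common.h) and my reading of the interleave order, the per-lane computation is roughly:

#include <stdint.h>

#ifndef DCT_CONST_BITS
#define DCT_CONST_BITS 14 /* assumption: matches vpx_dsp/txfm_common.h */
#endif

/* Hypothetical helper, not a libvpx function: rotate (a, b) by the
 * coefficient pair (c0, c1) with dct_const_round_shift()-style rounding. */
static void dct_rotate_pair(int32_t a, int32_t b, int32_t c0, int32_t c1,
                            int32_t *out0, int32_t *out1) {
  const int64_t rnd = (int64_t)1 << (DCT_CONST_BITS - 1);
  *out0 = (int32_t)(((int64_t)a * c0 - (int64_t)b * c1 + rnd) >> DCT_CONST_BITS);
  *out1 = (int32_t)(((int64_t)a * c1 + (int64_t)b * c0 + rnd) >> DCT_CONST_BITS);
}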
ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); } @@ -233,7 +233,7 @@ void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, /* short case just considers top 4 rows as valid output */ out += 4 * 16; for (i = 12; i--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[out]) \n\t" "sw $zero, 4(%[out]) \n\t" "sw $zero, 8(%[out]) \n\t" @@ -244,8 +244,7 @@ void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, "sw $zero, 28(%[out]) \n\t" : - : [out] "r" (out) - ); + : [out] "r"(out)); out += 16; } @@ -283,8 +282,8 @@ void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); CLIP_SH4_0_255(res0, res1, res2, res3); CLIP_SH4_0_255(res4, res5, res6, res7); - PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, - tmp0, tmp1, tmp2, tmp3); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); dst += (4 * dst_stride); } @@ -295,29 +294,28 @@ void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; /* load input data */ - LD_SH16(input, 8, - l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, - l0, l1, l2, l3, l4, l5, l6, l7); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, - l8, l9, l10, l11, l12, l13, l14, l15); + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6, + l7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11, + l12, l13, l14, l15); /* ADST in horizontal */ - VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, - l8, l9, l10, l11, l12, l13, l14, l15, - r0, r1, r2, r3, r4, r5, r6, r7, - r8, r9, r10, r11, r12, r13, r14, r15); + VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, + l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, + r12, r13, r14, r15); l1 = -r8; l3 = -r4; l13 = -r13; l15 = -r1; - TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, - l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5, + l6, l7); ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); - TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, - l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12, + l13, l14, l15); ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); } diff --git a/vpx_dsp/mips/idct32x32_msa.c b/vpx_dsp/mips/idct32x32_msa.c index de47597a8d54e1ffed2ccb99c81328210549c2f2..ed5cef18a9c0426dc008d3946e27adb8d7ce44b0 100644 --- a/vpx_dsp/mips/idct32x32_msa.c +++ b/vpx_dsp/mips/idct32x32_msa.c @@ -17,10 +17,10 @@ static void idct32x8_row_transpose_store(const int16_t *input, /* 1st & 2nd 8x8 */ LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, 
(tmp_buf), 8); ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); @@ -28,10 +28,10 @@ static void idct32x8_row_transpose_store(const int16_t *input, /* 3rd & 4th 8x8 */ LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); @@ -186,8 +186,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); /* 4 Stores */ - SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, - vec0, vec1, vec2, vec3); + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); @@ -198,8 +197,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); /* 4 Stores */ - ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, - vec1, vec2, vec0, vec3); + ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3); BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); ST_SH(reg0, (tmp_odd_buf + 13 * 8)); ST_SH(reg1, (tmp_odd_buf + 14 * 8)); @@ -213,8 +211,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, - loc0, loc1, loc2, loc3); + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); SUB2(reg0, reg4, reg1, reg5, vec0, vec1); @@ -228,8 +225,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, - loc0, loc1, loc2, loc3); + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); SUB2(reg0, reg4, reg3, reg7, vec0, vec1); @@ -242,8 +238,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, static void idct_butterfly_transpose_store(int16_t *tmp_buf, int16_t *tmp_eve_buf, - int16_t *tmp_odd_buf, - int16_t *dst) { + int16_t *tmp_odd_buf, int16_t *dst) { v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; @@ -317,26 +312,26 @@ static void idct_butterfly_transpose_store(int16_t *tmp_buf, /* Transpose : 16 vectors */ /* 1st & 2nd 8x8 */ - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); ST_SH4(m0, n0, m1, n1, (dst + 0), 32); ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); 
ST_SH4(m4, n4, m5, n5, (dst + 8), 32); ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); /* 3rd & 4th 8x8 */ LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); ST_SH4(m0, n0, m1, n1, (dst + 16), 32); ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); ST_SH4(m4, n4, m5, n5, (dst + 24), 32); ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); } @@ -349,8 +344,8 @@ static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { idct32x8_row_transpose_store(input, &tmp_buf[0]); idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); - idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], - &tmp_odd_buf[0], output); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); } static void idct8x32_column_even_process_store(int16_t *tmp_buf, @@ -541,8 +536,7 @@ static void idct8x32_column_odd_process_store(int16_t *tmp_buf, } static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, - int16_t *tmp_odd_buf, - uint8_t *dst, + int16_t *tmp_odd_buf, uint8_t *dst, int32_t dst_stride) { v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; @@ -563,8 +557,8 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); SRARI_H4_SH(m0, m2, m4, m6, 6); - VPX_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), - m0, m2, m4, m6); + VPX_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4, + m6); /* Load 8 & Store 8 */ vec0 = LD_SH(tmp_odd_buf + 4 * 8); @@ -578,13 +572,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); SRARI_H4_SH(m1, m3, m5, m7, 6); - VPX_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), - m1, m3, m5, m7); + VPX_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); SRARI_H4_SH(m1, m3, m5, m7, 6); - VPX_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), - m1, m3, m5, m7); + VPX_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5, + m7); /* Load 8 & Store 8 */ vec0 = LD_SH(tmp_odd_buf + 2 * 8); @@ -598,13 +591,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); SRARI_H4_SH(n0, n2, n4, n6, 6); - VPX_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), - n0, n2, n4, n6); + VPX_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); SRARI_H4_SH(n0, n2, n4, n6, 6); - VPX_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), - n0, n2, n4, n6); + VPX_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4, + n6); /* Load 8 & Store 8 */ vec0 = LD_SH(tmp_odd_buf + 5 * 8); @@ -618,13 +610,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, ADD4(loc0, vec3, loc1, 
vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); SRARI_H4_SH(n1, n3, n5, n7, 6); - VPX_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), - n1, n3, n5, n7); + VPX_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); SRARI_H4_SH(n1, n3, n5, n7, 6); - VPX_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), - n1, n3, n5, n7); + VPX_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5, + n7); } static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, @@ -634,8 +625,8 @@ static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); - idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], - dst, dst_stride); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); } void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, @@ -665,7 +656,7 @@ void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, int16_t *out_ptr = out_arr; for (i = 32; i--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[out_ptr]) \n\t" "sw $zero, 4(%[out_ptr]) \n\t" "sw $zero, 8(%[out_ptr]) \n\t" @@ -684,8 +675,7 @@ void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, "sw $zero, 60(%[out_ptr]) \n\t" : - : [out_ptr] "r" (out_ptr) - ); + : [out_ptr] "r"(out_ptr)); out_ptr += 32; } @@ -728,8 +718,8 @@ void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); CLIP_SH4_0_255(res0, res1, res2, res3); CLIP_SH4_0_255(res4, res5, res6, res7); - PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, - tmp0, tmp1, tmp2, tmp3); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); ST_UB2(tmp0, tmp1, dst, 16); dst += dst_stride; diff --git a/vpx_dsp/mips/idct4x4_msa.c b/vpx_dsp/mips/idct4x4_msa.c index 04064f87dfef555d140eba00b3f1e2aa06b379b7..50e824850d0c1f9f44ce7ba4a1b353a432231671 100644 --- a/vpx_dsp/mips/idct4x4_msa.c +++ b/vpx_dsp/mips/idct4x4_msa.c @@ -42,8 +42,8 @@ void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, in0_r -= in3_r; in2_r += in1_r; - PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, - in0, in1, in2, in3); + PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1, + in2, in3); ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); } diff --git a/vpx_dsp/mips/idct8x8_msa.c b/vpx_dsp/mips/idct8x8_msa.c index 6a24935fffaec50d61f0c90cfdc99c3b622fb7bd..c06330b027f93b8df30f99e4b87df8a5d21eafde 100644 --- a/vpx_dsp/mips/idct8x8_msa.c +++ b/vpx_dsp/mips/idct8x8_msa.c @@ -18,17 +18,17 @@ void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); /* rows transform */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* 1D idct8x8 */ - VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* columns transform */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, 
in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* 1D idct8x8 */ - VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* final rounding (add 2^4, divide by 2^5) and shift */ SRARI_H4_SH(in0, in1, in2, in3, 5); SRARI_H4_SH(in4, in5, in6, in7, 5); @@ -82,12 +82,12 @@ void vpx_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst, PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); /* stage4 */ - BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6, + in7); + TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* final rounding (add 2^4, divide by 2^5) and shift */ SRARI_H4_SH(in0, in1, in2, in3, 5); diff --git a/vpx_dsp/mips/intrapred16_dspr2.c b/vpx_dsp/mips/intrapred16_dspr2.c index 11444c718e7b4ccc8687e7c54bd42b41173e8dac..3e29d0ac39f37fabee210ccdb953332d78fe6e6e 100644 --- a/vpx_dsp/mips/intrapred16_dspr2.c +++ b/vpx_dsp/mips/intrapred16_dspr2.c @@ -13,10 +13,10 @@ #if HAVE_DSPR2 void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; - __asm__ __volatile__ ( + __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t" "lb %[tmp3], 2(%[left]) \n\t" @@ -146,26 +146,23 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, "sw %[tmp16], 8(%[dst]) \n\t" "sw %[tmp16], 12(%[dst]) \n\t" - : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), - [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), - [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), - [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8), - [tmp9] "=&r" (tmp9), [tmp10] "=&r" (tmp10), - [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12), - [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14), - [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16) - : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) - ); + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9), + [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12), + [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15), + [tmp16] "=&r"(tmp16) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; - int32_t above2, left2; + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, left2; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[above1], (%[above]) \n\t" "lw %[above2], 4(%[above]) \n\t" "lw %[left1], (%[left]) \n\t" @@ -316,14 
+313,12 @@ void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, "sw %[expected_dc], 8(%[dst]) \n\t" "sw %[expected_dc], 12(%[dst]) \n\t" - : [left1] "=&r" (left1), [above1] "=&r" (above1), - [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1), - [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1), - [above2] "=&r" (above2), [left2] "=&r" (left2), - [average] "=&r" (average), [tmp] "=&r" (tmp), - [expected_dc] "=&r" (expected_dc) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride) - ); + : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1), + [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1), + [above_r1] "=&r"(above_r1), [above2] "=&r"(above2), + [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); } #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/mips/intrapred4_dspr2.c b/vpx_dsp/mips/intrapred4_dspr2.c index 03baf4c9cc81019ba6960e299414831da42fc131..9f51d50c752f820b4a90fd65636375799977d6de 100644 --- a/vpx_dsp/mips/intrapred4_dspr2.c +++ b/vpx_dsp/mips/intrapred4_dspr2.c @@ -13,9 +13,9 @@ #if HAVE_DSPR2 void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4; + int32_t tmp1, tmp2, tmp3, tmp4; - __asm__ __volatile__ ( + __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t" "lb %[tmp3], 2(%[left]) \n\t" @@ -32,19 +32,18 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "add %[dst], %[dst], %[stride] \n\t" "sw %[tmp4], (%[dst]) \n\t" - : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), - [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4) - : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) - ); + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; + int32_t expected_dc; + int32_t average; + int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[above_c], (%[above]) \n\t" "lw %[left_c], (%[left]) \n\t" @@ -70,27 +69,26 @@ void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "add %[dst], %[dst], %[stride] \n\t" "sw %[expected_dc], (%[dst]) \n\t" - : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l), - [above_r] "=&r" (above_r), [left_c] "=&r" (left_c), - [left_l] "=&r" (left_l), [left_r] "=&r" (left_r), - [average] "=&r" (average), [tmp] "=&r" (tmp), - [expected_dc] "=&r" (expected_dc) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride) - ); + : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l), + [above_r] "=&r"(above_r), [left_c] "=&r"(left_c), + [left_l] "=&r"(left_l), [left_r] "=&r"(left_r), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); } void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t left0, left1, left2, left3; - int32_t res0, res1; - int32_t resl; - int32_t resr; - int32_t top_left; - uint8_t *cm = vpx_ff_cropTbl; - - __asm__ __volatile__ ( + 
int32_t abovel, abover; + int32_t left0, left1, left2, left3; + int32_t res0, res1; + int32_t resl; + int32_t resr; + int32_t top_left; + uint8_t *cm = vpx_ff_cropTbl; + + __asm__ __volatile__( "ulw %[resl], (%[above]) \n\t" "lbu %[left0], (%[left]) \n\t" @@ -174,7 +172,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "sra %[res0], %[res0], 16 \n\t" "lbux %[res0], %[res0](%[cm]) \n\t" - "sra %[res1], %[resr], 16 \n\t" "lbux %[res1], %[res1](%[cm]) \n\t" "sb %[res0], (%[dst]) \n\t" @@ -183,7 +180,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "sra %[res0], %[res0], 16 \n\t" "lbux %[res0], %[res0](%[cm]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" "sra %[res1], %[resl], 16 \n\t" "lbux %[res1], %[res1](%[cm]) \n\t" @@ -218,12 +214,11 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "sb %[res0], 2(%[dst]) \n\t" "sb %[res1], 3(%[dst]) \n\t" - : [abovel] "=&r" (abovel), [abover] "=&r" (abover), - [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2), - [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3), - [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) - ); + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0), + [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0), + [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl), + [resr] "=&r"(resr), [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); } #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/mips/intrapred8_dspr2.c b/vpx_dsp/mips/intrapred8_dspr2.c index 196ff5a062ef53ee03f1a769f534bb91ff042810..eac79d51000b0afb5fc8c2b023282eeb45ed17f2 100644 --- a/vpx_dsp/mips/intrapred8_dspr2.c +++ b/vpx_dsp/mips/intrapred8_dspr2.c @@ -13,9 +13,9 @@ #if HAVE_DSPR2 void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - __asm__ __volatile__ ( + __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t" "lb %[tmp3], 2(%[left]) \n\t" @@ -58,23 +58,20 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, "sw %[tmp8], (%[dst]) \n\t" "sw %[tmp8], 4(%[dst]) \n\t" - : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), - [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), - [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), - [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8) - : [left] "r" (left), [dst] "r" (dst), - [stride] "r" (stride) - ); + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; - int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[above1], (%[above]) \n\t" "lw %[above2], 4(%[above]) \n\t" "lw %[left1], (%[left]) \n\t" @@ -137,30 +134,29 @@ void 
vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, "sw %[expected_dc], (%[dst]) \n\t" "sw %[expected_dc], 4(%[dst]) \n\t" - : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1), - [above_r1] "=&r" (above_r1), [left1] "=&r" (left1), - [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1), - [above2] "=&r" (above2), [above_l2] "=&r" (above_l2), - [above_r2] "=&r" (above_r2), [left2] "=&r" (left2), - [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2), - [average] "=&r" (average), [tmp] "=&r" (tmp), - [expected_dc] "=&r" (expected_dc) - : [above] "r" (above), [left] "r" (left), [dst] "r" (dst), - [stride] "r" (stride) - ); + : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1), + [above_r1] "=&r"(above_r1), [left1] "=&r"(left1), + [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1), + [above2] "=&r"(above2), [above_l2] "=&r"(above_l2), + [above_r2] "=&r"(above_r2), [left2] "=&r"(left2), + [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); } void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t abovel_1, abover_1; - int32_t left0; - int32_t res0, res1, res2, res3; - int32_t reshw; - int32_t top_left; - uint8_t *cm = vpx_ff_cropTbl; - - __asm__ __volatile__ ( + int32_t abovel, abover; + int32_t abovel_1, abover_1; + int32_t left0; + int32_t res0, res1, res2, res3; + int32_t reshw; + int32_t top_left; + uint8_t *cm = vpx_ff_cropTbl; + + __asm__ __volatile__( "ulw %[reshw], (%[above]) \n\t" "ulw %[top_left], 4(%[above]) \n\t" @@ -595,13 +591,12 @@ void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, "sb %[res2], 6(%[dst]) \n\t" "sb %[res3], 7(%[dst]) \n\t" - : [abovel] "=&r" (abovel), [abover] "=&r" (abover), - [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1), - [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3), - [res0] "=&r" (res0), [res1] "=&r" (res1), - [reshw] "=&r" (reshw), [top_left] "=&r" (top_left) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) - ); + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), + [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1), + [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3), + [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw), + [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); } #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/mips/intrapred_msa.c b/vpx_dsp/mips/intrapred_msa.c index f6fbe4016257c93a8cac7a071e7205c031828015..b5ee943031a0982bbff270d62afe9b8346a400ef 100644 --- a/vpx_dsp/mips/intrapred_msa.c +++ b/vpx_dsp/mips/intrapred_msa.c @@ -11,10 +11,11 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" -#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \ - out0 = __msa_subs_u_h(out0, in0); \ - out1 = __msa_subs_u_h(out1, in1); \ -} +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ + { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ + } static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, int32_t dst_stride) { @@ -150,8 +151,8 @@ static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, } static void intra_predict_dc_4x4_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t 
*src_left, uint8_t *dst, + int32_t dst_stride) { uint32_t val0, val1; v16i8 store, src = { 0 }; v8u16 sum_h; @@ -199,8 +200,8 @@ static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { } static void intra_predict_dc_8x8_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint64_t val0, val1; v16i8 store; v16u8 src = { 0 }; @@ -260,8 +261,8 @@ static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { } static void intra_predict_dc_16x16_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { v16u8 top, left, out; v8u16 sum_h, sum_top, sum_left; v4u32 sum_w; @@ -313,8 +314,8 @@ static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { } static void intra_predict_dc_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint32_t row; v16u8 top0, top1, left0, left1, out; v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; @@ -381,8 +382,8 @@ static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { } static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint32_t val; uint8_t top_left = src_top_ptr[-1]; v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; @@ -409,8 +410,8 @@ static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, } static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint64_t val; uint8_t top_left = src_top_ptr[-1]; uint32_t loop_cnt; @@ -442,8 +443,8 @@ static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, } static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint8_t top_left = src_top_ptr[-1]; uint32_t loop_cnt; v16i8 src_top, src_left0, src_left1, src_left2, src_left3; @@ -491,8 +492,8 @@ static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, } static void intra_predict_tm_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint8_t top_left = src_top[-1]; uint32_t loop_cnt; v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; diff --git a/vpx_dsp/mips/inv_txfm_dspr2.h b/vpx_dsp/mips/inv_txfm_dspr2.h index abd85091188eb371cc2375b26665c3cd9e95e118..edd54aec5e20378d0cda7fe7b371ac8d396f67dd 100644 --- a/vpx_dsp/mips/inv_txfm_dspr2.h +++ b/vpx_dsp/mips/inv_txfm_dspr2.h @@ -23,31 +23,39 @@ extern "C" { #endif #if HAVE_DSPR2 -#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) ({ \ +#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ + ({ \ \ - int32_t tmp, out; \ - int dct_cost_rounding = DCT_CONST_ROUNDING; \ - int in = input; \ + int32_t tmp, out; \ + int dct_cost_rounding = DCT_CONST_ROUNDING; \ + int in = input; \ \ - __asm__ __volatile__ ( \ - /* out = dct_const_round_shift(input_dc * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac1 \n\t"\ - "mthi $zero, $ac1 \n\t"\ - 
"madd $ac1, %[in], %[cospi_16_64] \n\t"\ - "extp %[tmp], $ac1, 31 \n\t"\ + __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac1 " \ + " \n\t" \ + "mthi $zero, $ac1 " \ + " \n\t" \ + "madd $ac1, %[in], " \ + "%[cospi_16_64] \n\t" \ + "extp %[tmp], $ac1, " \ + "31 \n\t" \ \ - /* out = dct_const_round_shift(out * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac2 \n\t"\ - "mthi $zero, $ac2 \n\t"\ - "madd $ac2, %[tmp], %[cospi_16_64] \n\t"\ - "extp %[out], $ac2, 31 \n\t"\ + /* out = dct_const_round_shift(out * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac2 " \ + " \n\t" \ + "mthi $zero, $ac2 " \ + " \n\t" \ + "madd $ac2, %[tmp], " \ + "%[cospi_16_64] \n\t" \ + "extp %[out], $ac2, " \ + "31 \n\t" \ \ - : [tmp] "=&r" (tmp), [out] "=r" (out) \ - : [in] "r" (in), \ - [dct_cost_rounding] "r" (dct_cost_rounding), \ - [cospi_16_64] "r" (cospi_16_64) \ - ); \ - out; }) + : [tmp] "=&r"(tmp), [out] "=r"(out) \ + : [in] "r"(in), \ + [dct_cost_rounding] "r"(dct_cost_rounding), \ + [cospi_16_64] "r"(cospi_16_64)); \ + out; \ + }) void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); @@ -59,10 +67,8 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); void iadst8_dspr2(const int16_t *input, int16_t *output); -void idct16_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows); -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); void iadst16_dspr2(const int16_t *input, int16_t *output); #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/mips/inv_txfm_msa.h b/vpx_dsp/mips/inv_txfm_msa.h index 303fb3ea67368745052dfaa8d3637191f30b3124..ee94782c9a45f90353027e0f570922199338b87c 100644 --- a/vpx_dsp/mips/inv_txfm_msa.h +++ b/vpx_dsp/mips/inv_txfm_msa.h @@ -15,391 +15,392 @@ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" -#define VPX_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ - v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ - cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ - v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \ - -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \ - \ - SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ - ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ - cnst1_m, cnst2_m, cnst3_m, in7, in0, \ - in4, in3); \ - \ - SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - \ - 
DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ - cnst1_m, cnst2_m, cnst3_m, in5, in2, \ - in6, in1); \ - BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ - out7 = -s0_m; \ - out0 = s1_m; \ - \ - SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \ - cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ - \ - ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - cnst1_m = cnst0_m; \ - \ - ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ - cnst2_m, cnst3_m, cnst1_m, out1, out6, \ - s0_m, s1_m); \ - \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - out1 = -out1; \ - out3 = -out3; \ - out5 = -out5; \ -} +#define VPX_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in7, in0, in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in5, in2, in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ + cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 
= DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ + } -#define VPX_SET_COSPI_PAIR(c0_h, c1_h) ({ \ - v8i16 out0_m, r0_m, r1_m; \ - \ - r0_m = __msa_fill_h(c0_h); \ - r1_m = __msa_fill_h(c1_h); \ - out0_m = __msa_ilvev_h(r1_m, r0_m); \ - \ - out0_m; \ -}) +#define VPX_SET_COSPI_PAIR(c0_h, c1_h) \ + ({ \ + v8i16 out0_m, r0_m, r1_m; \ + \ + r0_m = __msa_fill_h(c0_h); \ + r1_m = __msa_fill_h(c1_h); \ + out0_m = __msa_ilvev_h(r1_m, r0_m); \ + \ + out0_m; \ + }) -#define VPX_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) { \ - uint8_t *dst_m = (uint8_t *) (dst); \ - v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ - v16i8 tmp0_m, tmp1_m; \ - v16i8 zero_m = { 0 }; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - \ - LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ - ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \ - zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \ - ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \ - res0_m, res1_m, res2_m, res3_m); \ - CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ - PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ -} +#define VPX_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ + { \ + uint8_t *dst_m = (uint8_t *)(dst); \ + v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 zero_m = { 0 }; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ + ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \ + res0_m, res1_m, res2_m, res3_m); \ + ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \ + res2_m, res3_m); \ + CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ + PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ + } -#define VPX_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 c0_m, c1_m, c2_m, c3_m; \ - v8i16 step0_m, step1_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - c0_m = VPX_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ - c1_m = VPX_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ - step0_m = __msa_ilvr_h(in2, in0); \ - DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - \ - c2_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - c3_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - step1_m = __msa_ilvr_h(in3, in1); \ - DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - \ - PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ - SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ - BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, \ - (v8i16)tmp2_m, (v8i16)tmp3_m, \ - out0, out1, out2, out3); \ -} +#define VPX_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 c0_m, c1_m, c2_m, c3_m; \ + v8i16 step0_m, step1_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + c0_m = VPX_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + c1_m = VPX_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + step0_m = __msa_ilvr_h(in2, in0); \ + DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + \ + c2_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + c3_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + step1_m = __msa_ilvr_h(in3, in1); \ + DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + \ + PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, 
tmp2_m, tmp0_m, tmp2_m); \ + SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ + BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \ + out0, out1, out2, out3); \ + } -#define VPX_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 res0_m, res1_m, c0_m, c1_m; \ - v8i16 k1_m, k2_m, k3_m, k4_m; \ - v8i16 zero_m = { 0 }; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v4i32 int0_m, int1_m, int2_m, int3_m; \ - v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \ - sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \ - -sinpi_4_9 }; \ - \ - SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ - ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ - ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ - int0_m = tmp2_m + tmp1_m; \ - \ - SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ - ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - int1_m = tmp0_m + tmp1_m; \ - \ - c0_m = __msa_splati_h(mask_m, 6); \ - ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ - ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - int2_m = tmp0_m + tmp1_m; \ - \ - c0_m = __msa_splati_h(mask_m, 6); \ - c0_m = __msa_ilvev_h(c0_m, k1_m); \ - \ - res0_m = __msa_ilvr_h((in1), (in3)); \ - tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ - int3_m = tmp2_m + tmp0_m; \ - \ - res0_m = __msa_ilvr_h((in2), (in3)); \ - c1_m = __msa_ilvev_h(k4_m, k3_m); \ - \ - tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ - res1_m = __msa_ilvr_h((in0), (in2)); \ - c1_m = __msa_ilvev_h(k1_m, zero_m); \ - \ - tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ - int3_m += tmp2_m; \ - int3_m += tmp3_m; \ - \ - SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ - PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ -} +#define VPX_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 res0_m, res1_m, c0_m, c1_m; \ + v8i16 k1_m, k2_m, k3_m, k4_m; \ + v8i16 zero_m = { 0 }; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 int0_m, int1_m, int2_m, int3_m; \ + v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \ + -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \ + \ + SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ + ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ + int0_m = tmp2_m + tmp1_m; \ + \ + SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ + ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int1_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int2_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + c0_m = __msa_ilvev_h(c0_m, k1_m); \ + \ + res0_m = __msa_ilvr_h((in1), (in3)); \ + tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ + int3_m = tmp2_m + tmp0_m; \ + \ + res0_m = __msa_ilvr_h((in2), (in3)); \ + c1_m = __msa_ilvev_h(k4_m, k3_m); \ + \ + tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ + res1_m = __msa_ilvr_h((in0), (in2)); \ + c1_m = __msa_ilvev_h(k1_m, zero_m); \ + \ + tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ + int3_m += tmp2_m; \ + int3_m += tmp3_m; \ + \ + SRARI_W4_SW(int0_m, int1_m, 
int2_m, int3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ + PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ + } -#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \ - v8i16 c0_m, c1_m; \ - \ - SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ - c0_m = __msa_ilvev_h(c1_m, c0_m); \ - \ - c0_m; \ -}) +#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ + ({ \ + v8i16 c0_m, c1_m; \ + \ + SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ + c0_m = __msa_ilvev_h(c1_m, c0_m); \ + \ + c0_m; \ + }) /* multiply and add macro */ -#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ - out0, out1, out2, out3) { \ - v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ - ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ - DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \ - cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ - DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \ - cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ -} +#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ + DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ + cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ + cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ + } /* idct 8x8 macro */ -#define VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ - v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ - cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ - \ - k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \ - k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ - k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ - k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ - VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ - SUB2(in1, in3, in7, in5, res0_m, res1_m); \ - k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ - k1_m = __msa_splati_h(mask_m, 4); \ - \ - ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ - DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ - tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - tp4_m = in1 + in3; \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ - tp7_m = in7 + in5; \ - k2_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \ - in0, in4, in2, in6); \ - 
BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ - BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \ - out0, out1, out2, out3, out4, out5, out6, out7); \ -} +#define VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ + cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ + \ + k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \ + k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ + k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ + k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ + VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ + SUB2(in1, in3, in7, in5, res0_m, res1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ + k1_m = __msa_splati_h(mask_m, 4); \ + \ + ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ + DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + tp4_m = in1 + in3; \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ + tp7_m = in7 + in5; \ + k2_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \ + BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ + BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \ + out1, out2, out3, out4, out5, out6, out7); \ + } -#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ - v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ - v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \ - cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ - v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \ - cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ - v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \ - -cospi_16_64, 0, 0, 0, 0 }; \ - \ - k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ - k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ - ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r0_m, r1_m, r2_m, r3_m); \ - k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ - k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ - ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r4_m, r5_m, r6_m, r7_m); \ - ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ - SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ - k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ - k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ - ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r0_m, r1_m, r2_m, r3_m); \ - k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ - k1_m = 
VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ - ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r4_m, r5_m, r6_m, r7_m); \ - ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ - SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ - ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ - BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ - k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ - k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ - ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r0_m, r1_m, r2_m, r3_m); \ - k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ - DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ - r4_m, r5_m, r6_m, r7_m); \ - ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ - SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ - k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ - k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ - ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ - ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ - DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ - \ - out1 = -in1; \ - out3 = -in3; \ - out5 = -in5; \ - out7 = -in7; \ -} +#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ + v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ + v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \ + cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ + v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \ + -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ + v8i16 mask3_m = { \ + -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \ + }; \ + \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ + ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ + ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, 
DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ + ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ + ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ + ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ + BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ + ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \ + r6_m, r7_m); \ + ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ + SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ + k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ + ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \ + m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ + ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \ + m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ + \ + out1 = -in1; \ + out3 = -in3; \ + out5 = -in5; \ + out7 = -in7; \ + } -#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \ - r9, r10, r11, r12, r13, r14, r15, \ - out0, out1, out2, out3, out4, out5, \ - out6, out7, out8, out9, out10, out11, \ - out12, out13, out14, out15) { \ - v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ - v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ - v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ - v8i16 h8_m, h9_m, h10_m, h11_m; \ - v8i16 k0_m, k1_m, k2_m, k3_m; \ - \ - /* stage 1 */ \ - k0_m = VPX_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ - MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \ - g0_m, g1_m, g2_m, g3_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ - MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \ - g4_m, g5_m, 
g6_m, g7_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ - MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \ - g8_m, g9_m, g10_m, g11_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ - MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \ - g12_m, g13_m, g14_m, g15_m); \ - \ - /* stage 2 */ \ - k0_m = VPX_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ - k2_m = VPX_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ - MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \ - h0_m, h1_m, h2_m, h3_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ - k1_m = VPX_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ - MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \ - h4_m, h5_m, h6_m, h7_m); \ - BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ - BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \ - h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ - \ - /* stage 3 */ \ - BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - k2_m = VPX_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ - MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \ - out4, out6, out5, out7); \ - MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \ - out12, out14, out13, out15); \ - \ - /* stage 4 */ \ - k0_m = VPX_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ - k1_m = VPX_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ - k3_m = VPX_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ - MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ - MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ - MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ - MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ -} +#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \ + r12, r13, r14, r15, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ + v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ + v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ + v8i16 h8_m, h9_m, h10_m, h11_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m; \ + \ + /* stage 1 */ \ + k0_m = VPX_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ + MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ + MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_25_64, 
cospi_7_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ + MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \ + g11_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ + MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \ + g15_m); \ + \ + /* stage 2 */ \ + k0_m = VPX_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ + k2_m = VPX_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ + MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \ + h3_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ + k1_m = VPX_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ + MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \ + h6_m, h7_m); \ + BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ + BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \ + h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ + \ + /* stage 3 */ \ + BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k2_m = VPX_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ + MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \ + out7); \ + MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \ + out13, out15); \ + \ + /* stage 4 */ \ + k0_m = VPX_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + k1_m = VPX_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + k3_m = VPX_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ + MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ + MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ + MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ + MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ + } void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, int32_t dst_stride); diff --git a/vpx_dsp/mips/itrans16_dspr2.c b/vpx_dsp/mips/itrans16_dspr2.c index 6d41e6190b78c032023aa851efc9c65c71c7c8f1..0ec0c2059f4463c6c84185892f9e4ec2d9375a8f 100644 --- a/vpx_dsp/mips/itrans16_dspr2.c +++ b/vpx_dsp/mips/itrans16_dspr2.c @@ -26,11 +26,11 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, int result1, result2, result3, result4; const int const_2_power_13 = 8192; - for (i = no_rows; i--; ) { + for (i = no_rows; i--;) { /* prefetch row */ prefetch_load((const uint8_t *)(input + 16)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" "lh %[load2], 16(%[input]) \n\t" "lh %[load3], 8(%[input]) \n\t" @@ -64,19 +64,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "sub %[step1_2], %[step2_1], %[step2_2] \n\t" "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), - [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" 
(cospi_8_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "lh %[load5], 2(%[input]) \n\t" "lh %[load6], 30(%[input]) \n\t" "lh %[load7], 18(%[input]) \n\t" @@ -126,19 +125,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "add %[step2_8], %[result1], %[result2] \n\t" "add %[step2_15], %[result4], %[result3] \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), - [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( "lh %[load1], 10(%[input]) \n\t" "lh %[load2], 22(%[input]) \n\t" "lh %[load3], 26(%[input]) \n\t" @@ -188,19 +186,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "add %[step2_11], %[result1], %[result2] \n\t" "add %[step2_12], %[result4], %[result3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] 
"r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( "lh %[load5], 4(%[input]) \n\t" "lh %[load6], 28(%[input]) \n\t" "lh %[load7], 20(%[input]) \n\t" @@ -253,19 +250,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_4], %[result1], %[result2] \n\t" "add %[step1_7], %[result4], %[result3] \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t" @@ -305,18 +301,16 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "extp %[step1_11], $ac2, 31 \n\t" "extp %[step1_12], $ac3, 31 \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), - [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) - : [const_2_power_13] "r" (const_2_power_13), - [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), - [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), - [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), - [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "add %[load5], %[step1_0], %[step1_7] \n\t" "add %[load5], %[load5], %[step2_12] \n\t" "add %[load5], %[load5], %[step2_15] \n\t" @@ -350,17 +344,15 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "sh %[load5], 448(%[output]) \n\t" "sh %[load6], 480(%[output]) \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6) - : [output] "r" (output), - [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), - [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), - [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), - [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), - [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), - [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] 
"r"(step1_1), + [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), + [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), + [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), + [step2_14] "r"(step2_14), [step2_15] "r"(step2_15)); + + __asm__ __volatile__( "add %[load5], %[step1_2], %[step1_5] \n\t" "add %[load5], %[load5], %[step1_13] \n\t" "add %[load6], %[step1_3], %[step1_4] \n\t" @@ -386,21 +378,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "sh %[load5], 384(%[output]) \n\t" "sh %[load6], 416(%[output]) \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6) - : [output] "r" (output), - [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), - [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), - [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), - [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) - ); + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13)); input += 16; output += 1; } } -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -416,9 +405,9 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, /* prefetch vpx_ff_cropTbl */ prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); prefetch_load(vpx_ff_cropTbl + 128); prefetch_load(vpx_ff_cropTbl + 160); prefetch_load(vpx_ff_cropTbl + 192); @@ -426,7 +415,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, for (i = 0; i < 16; ++i) { dest_pix = (dest + i); - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" "lh %[load2], 16(%[input]) \n\t" "lh %[load3], 8(%[input]) \n\t" @@ -460,19 +449,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sub %[step1_2], %[step2_1], %[step2_2] \n\t" "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), - [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "lh 
%[load5], 2(%[input]) \n\t" "lh %[load6], 30(%[input]) \n\t" "lh %[load7], 18(%[input]) \n\t" @@ -522,19 +510,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step2_8], %[result1], %[result2] \n\t" "add %[step2_15], %[result4], %[result3] \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), - [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( "lh %[load1], 10(%[input]) \n\t" "lh %[load2], 22(%[input]) \n\t" "lh %[load3], 26(%[input]) \n\t" @@ -584,19 +571,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step2_11], %[result1], %[result2] \n\t" "add %[step2_12], %[result4], %[result3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( "lh %[load5], 4(%[input]) \n\t" "lh %[load6], 28(%[input]) \n\t" "lh %[load7], 20(%[input]) \n\t" @@ -650,19 +636,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_4], %[result1], %[result2] \n\t" "add %[step1_7], %[result4], %[result3] \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), 
[step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t" @@ -702,23 +687,21 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "extp %[step1_11], $ac2, 31 \n\t" "extp %[step1_12], $ac3, 31 \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), - [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) - : [const_2_power_13] "r" (const_2_power_13), - [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), - [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), - [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), - [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), - [cospi_16_64] "r" (cospi_16_64) - ); + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); step1_8 = step2_8 + step2_11; step1_9 = step2_9 + step2_10; step1_14 = step2_13 + step2_14; step1_15 = step2_12 + step2_15; - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_0], %[step1_7] \n\t" "add %[load5], %[load5], %[step1_15] \n\t" @@ -870,18 +853,16 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), - [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), - [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), - [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), - [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), - [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), - [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), - [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), - [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) - ); + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) + : + [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), + [step1_10] "r"(step1_10), [step1_11] 
"r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15)); input += 16; } @@ -889,15 +870,11 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); // First transform rows idct16_rows_dspr2(input, out, 16); @@ -908,17 +885,13 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); int16_t *outptr = out; uint32_t i; uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. @@ -926,7 +899,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, outptr += 4; for (i = 0; i < 6; ++i) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[outptr]) \n\t" "sw $zero, 32(%[outptr]) \n\t" "sw $zero, 64(%[outptr]) \n\t" @@ -945,8 +918,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, "sw $zero, 480(%[outptr]) \n\t" : - : [outptr] "r" (outptr) - ); + : [outptr] "r"(outptr)); outptr += 2; } @@ -966,35 +938,31 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, int32_t vector_1, vector_2, vector_3, vector_4; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + : + : [pos] "r"(pos)); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__ ( + __asm__ __volatile__( "addi %[out], %[out], 32 \n\t" "sra %[a1], %[out], 6 \n\t" - : [out] "+r" (out), [a1] "=r" (a1) - : - ); + : [out] "+r"(out), [a1] "=r"(a1) + :); if (a1 < 0) { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 16; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t" @@ -1009,25 +977,22 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_4], 12(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), 
[vector_a1] "r"(vector_a1)); } } else { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 16; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t" @@ -1042,12 +1007,11 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_4], 12(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } } @@ -1072,21 +1036,20 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { int x14 = input[1]; int x15 = input[14]; - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = output[8] - = output[9] = output[10] = output[11] = output[12] - = output[13] = output[14] = output[15] = 0; + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; return; } // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; s5 = x4 * cospi_23_64 - x5 * cospi_9_64; s6 = x6 * cospi_13_64 + x7 * cospi_19_64; s7 = x6 * cospi_19_64 - x7 * cospi_13_64; @@ -1095,9 +1058,9 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { s10 = x10 * cospi_21_64 + x11 * cospi_11_64; s11 = x10 * cospi_11_64 - x11 * cospi_21_64; s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; x0 = dct_const_round_shift(s0 + s8); x1 = dct_const_round_shift(s1 + s9); @@ -1107,8 +1070,8 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { x5 = dct_const_round_shift(s5 + s13); x6 = dct_const_round_shift(s6 + s14); x7 = dct_const_round_shift(s7 + s15); - x8 = dct_const_round_shift(s0 - s8); - x9 = dct_const_round_shift(s1 - s9); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); x10 = dct_const_round_shift(s2 - s10); x11 = dct_const_round_shift(s3 - s11); x12 = dct_const_round_shift(s4 - s12); @@ -1125,14 +1088,14 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { s5 = x5; s6 = x6; s7 = x7; - s8 = x8 * 
cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; x0 = s0 + s4; x1 = s1 + s5; @@ -1156,18 +1119,18 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { s1 = x1; s2 = x2; s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; s8 = x8; s9 = x9; s10 = x10; s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; x0 = s0 + s2; x1 = s1 + s3; @@ -1187,13 +1150,13 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { x15 = dct_const_round_shift(s13 - s15); // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); + s2 = (-cospi_16_64) * (x2 + x3); s3 = cospi_16_64 * (x2 - x3); s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (- x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (- x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); x2 = dct_const_round_shift(s2); @@ -1205,23 +1168,22 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { x14 = dct_const_round_shift(s14); x15 = dct_const_round_shift(s15); - output[0] = x0; + output[0] = x0; output[1] = -x8; - output[2] = x12; + output[2] = x12; output[3] = -x4; - output[4] = x6; - output[5] = x14; - output[6] = x10; - output[7] = x2; - output[8] = x3; - output[9] = x11; - output[10] = x15; - output[11] = x7; - output[12] = x5; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; output[13] = -x13; - output[14] = x9; + output[14] = x9; output[15] = -x1; } - #endif // HAVE_DSPR2 diff --git a/vpx_dsp/mips/itrans32_cols_dspr2.c b/vpx_dsp/mips/itrans32_cols_dspr2.c index 553acb0f5bfd96885447377357fd9966d7f83df0..ce25d55c9c0bb23b4a3bfe28ca14cd84101a0613 100644 --- a/vpx_dsp/mips/itrans32_cols_dspr2.c +++ b/vpx_dsp/mips/itrans32_cols_dspr2.c @@ -39,9 +39,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, /* prefetch vpx_ff_cropTbl */ prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); prefetch_load(vpx_ff_cropTbl + 128); prefetch_load(vpx_ff_cropTbl + 160); 
prefetch_load(vpx_ff_cropTbl + 192); @@ -51,7 +51,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, dest_pix = dest + i; dest_pix1 = dest + i + 31 * dest_stride; - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 2(%[input]) \n\t" "lh %[load2], 62(%[input]) \n\t" "lh %[load3], 34(%[input]) \n\t" @@ -101,18 +101,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_16], %[temp0], %[temp1] \n\t" "add %[step1_31], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), - [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), - [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( "lh %[load1], 18(%[input]) \n\t" "lh %[load2], 46(%[input]) \n\t" "lh %[load3], 50(%[input]) \n\t" @@ -162,18 +161,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_19], %[temp0], %[temp1] \n\t" "add %[step1_28], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), - [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), - [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( "lh %[load1], 10(%[input]) \n\t" "lh %[load2], 54(%[input]) \n\t" "lh %[load3], 42(%[input]) \n\t" @@ -223,18 +221,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_20], %[temp0], %[temp1] \n\t" "add %[step1_27], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] 
"=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), - [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), - [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( "lh %[load1], 26(%[input]) \n\t" "lh %[load2], 38(%[input]) \n\t" "lh %[load3], 58(%[input]) \n\t" @@ -280,18 +277,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_23], %[temp0], %[temp1] \n\t" "add %[step1_24], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), - [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), - [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( "lh %[load1], 4(%[input]) \n\t" "lh %[load2], 60(%[input]) \n\t" "lh %[load3], 36(%[input]) \n\t" @@ -337,18 +333,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step2_8], %[temp0], %[temp1] \n\t" "add %[step2_15], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), - [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] 
"=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( "lh %[load1], 20(%[input]) \n\t" "lh %[load2], 44(%[input]) \n\t" "lh %[load3], 52(%[input]) \n\t" @@ -394,18 +389,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step2_11], %[temp0], %[temp1] \n\t" "add %[step2_12], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "sub %[temp0], %[step2_14], %[step2_13] \n\t" @@ -440,33 +434,31 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), - [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), - [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), - [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) - : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8), - [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), - [step2_11] "r" (step2_11), [step2_12] "r" (step2_12), - [step2_13] "r" (step2_13), [step2_14] "r" (step2_14), - [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); step2_18 = step1_17 - step1_18; step2_29 = step1_30 - step1_29; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" "madd $ac0, %[step2_29], %[cospi_24_64] 
\n\t" "extp %[step3_18], $ac0, 31 \n\t" - : [step3_18] "=r" (step3_18) - : [const_2_power_13] "r" (const_2_power_13), - [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -474,18 +466,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step2_19 = step1_16 - step1_19; step2_28 = step1_31 - step1_28; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" "extp %[step3_19], $ac0, 31 \n\t" - : [step3_19] "=r" (step3_19) - : [const_2_power_13] "r" (const_2_power_13), - [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -498,18 +489,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step2_20 = step1_23 - step1_20; step2_27 = step1_24 - step1_27; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" "extp %[step3_20], $ac0, 31 \n\t" - : [step3_20] "=r" (step3_20) - : [const_2_power_13] "r" (const_2_power_13), - [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -517,18 +507,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step2_21 = step1_22 - step1_21; step2_26 = step1_25 - step1_26; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" "extp %[step3_21], $ac1, 31 \n\t" - : [step3_21] "=r" (step3_21) - : [const_2_power_13] "r" (const_2_power_13), - [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -556,7 +545,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step2_30 = step3_30 + step3_25; step2_31 = step3_31 + step3_24; - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" "lh %[load2], 32(%[input]) \n\t" "lh %[load3], 
16(%[input]) \n\t" @@ -588,19 +577,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sub %[step1_2], %[temp1], %[temp2] \n\t" "sub %[step1_3], %[temp0], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "lh %[load1], 8(%[input]) \n\t" "lh %[load2], 56(%[input]) \n\t" "lh %[load3], 40(%[input]) \n\t" @@ -649,17 +636,15 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_4], %[temp0], %[temp1] \n\t" "add %[step1_7], %[temp3], %[temp2] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); step2_0 = step1_0 + step1_7; step2_1 = step1_1 + step1_6; @@ -688,67 +673,63 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step1_14 = step2_1 - step3_14; step1_15 = step2_0 - step3_15; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_27], %[step2_20] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_20], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) - : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20), - [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_20 + step2_27) * cospi_16_64; step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ 
__volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_26], %[step2_21] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_21], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) - : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26), - [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_21 + step2_26) * cospi_16_64; step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_22], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) - : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25), - [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_22 + step2_25) * cospi_16_64; step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_24], %[step2_23] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_23], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) - : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24), - [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_23 + step2_24) * cospi_16_64; step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_0], %[step2_31] \n\t" "addi %[temp0], %[temp0], 32 \n\t" @@ -783,21 +764,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix]) \n\t" "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), - [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), - [step2_28] "r" (step2_28), [step2_29] "r" (step2_29), - [step2_30] "r" (step2_30), [step2_31] "r" (step2_31) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), + [step2_31] "r"(step2_31)); step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 
0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" @@ -820,14 +800,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix1]) \n\t" "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_4], %[step1_27] \n\t" "addi %[temp0], %[temp0], 32 \n\t" @@ -862,21 +841,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix]) \n\t" "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), - [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), - [step1_24] "r" (step1_24), [step1_25] "r" (step1_25), - [step1_26] "r" (step1_26), [step1_27] "r" (step1_27) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), + [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), + [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), + [step1_27] "r"(step1_27)); step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" @@ -899,14 +877,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix1]) \n\t" "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_8], %[step1_23] \n\t" "addi %[temp0], %[temp0], 32 \n\t" @@ -941,21 +918,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix]) \n\t" "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" 
(temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), - [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), - [step1_20] "r" (step1_20), [step1_21] "r" (step1_21), - [step1_22] "r" (step1_22), [step1_23] "r" (step1_23) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), + [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), + [step1_23] "r"(step1_23)); step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" @@ -978,14 +954,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix1]) \n\t" "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_12], %[step2_19] \n\t" "addi %[temp0], %[temp0], 32 \n\t" @@ -1019,21 +994,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), - [step1_14] "r" (step1_14), [step1_15] "r" (step1_15), - [step2_16] "r" (step2_16), [step2_17] "r" (step2_17), - [step2_18] "r" (step2_18), [step2_19] "r" (step2_19) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), + [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), + [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" @@ -1055,12 +1029,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - : [temp0] "=&r" (temp0), 
[temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); input += 32; } diff --git a/vpx_dsp/mips/itrans32_dspr2.c b/vpx_dsp/mips/itrans32_dspr2.c index 523da1df1bc7bb45e78b94f29041e4a7a990c114..d71c5ffed512feeb347abd147c352b303759b66a 100644 --- a/vpx_dsp/mips/itrans32_dspr2.c +++ b/vpx_dsp/mips/itrans32_dspr2.c @@ -40,16 +40,16 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, const int const_2_power_13 = 8192; const int32_t *input_int; - for (i = no_rows; i--; ) { + for (i = no_rows; i--;) { input_int = (const int32_t *)input; - if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | - input_int[4] | input_int[5] | input_int[6] | input_int[7] | - input_int[8] | input_int[9] | input_int[10] | input_int[11] | + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | input_int[12] | input_int[13] | input_int[14] | input_int[15])) { input += 32; - __asm__ __volatile__ ( + __asm__ __volatile__( "sh $zero, 0(%[output]) \n\t" "sh $zero, 64(%[output]) \n\t" "sh $zero, 128(%[output]) \n\t" @@ -84,8 +84,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sh $zero, 1984(%[output]) \n\t" : - : [output] "r" (output) - ); + : [output] "r"(output)); output += 1; @@ -96,7 +95,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, prefetch_load((const uint8_t *)(input + 32)); prefetch_load((const uint8_t *)(input + 48)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 2(%[input]) \n\t" "lh %[load2], 62(%[input]) \n\t" "lh %[load3], 34(%[input]) \n\t" @@ -146,19 +145,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_16], %[temp0], %[temp1] \n\t" "add %[step1_31], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), - [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), - [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( "lh 
%[load1], 18(%[input]) \n\t" "lh %[load2], 46(%[input]) \n\t" "lh %[load3], 50(%[input]) \n\t" @@ -208,19 +205,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_19], %[temp0], %[temp1] \n\t" "add %[step1_28], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), - [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), - [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( "lh %[load1], 10(%[input]) \n\t" "lh %[load2], 54(%[input]) \n\t" "lh %[load3], 42(%[input]) \n\t" @@ -270,19 +265,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_20], %[temp0], %[temp1] \n\t" "add %[step1_27], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), - [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), - [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( "lh %[load1], 26(%[input]) \n\t" "lh %[load2], 38(%[input]) \n\t" "lh %[load3], 58(%[input]) \n\t" @@ -332,19 +325,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_23], %[temp0], %[temp1] \n\t" "add %[step1_24], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), - [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) - : [const_2_power_13] "r" 
(const_2_power_13), [input] "r" (input), - [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), - [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( "lh %[load1], 4(%[input]) \n\t" "lh %[load2], 60(%[input]) \n\t" "lh %[load3], 36(%[input]) \n\t" @@ -394,19 +385,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step2_8], %[temp0], %[temp1] \n\t" "add %[step2_15], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), - [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( "lh %[load1], 20(%[input]) \n\t" "lh %[load2], 44(%[input]) \n\t" "lh %[load3], 52(%[input]) \n\t" @@ -456,19 +445,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step2_11], %[temp0], %[temp1] \n\t" "add %[step2_12], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] 
"r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "sub %[temp0], %[step2_14], %[step2_13] \n\t" @@ -507,34 +494,31 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), - [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), - [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), - [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) - : [const_2_power_13] "r" (const_2_power_13), - [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), - [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), - [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), - [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); step2_18 = step1_17 - step1_18; step2_29 = step1_30 - step1_29; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" "extp %[step3_18], $ac0, 31 \n\t" - : [step3_18] "=r" (step3_18) - : [const_2_power_13] "r" (const_2_power_13), - [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -542,18 +526,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step2_19 = step1_16 - step1_19; step2_28 = step1_31 - step1_28; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" "extp %[step3_19], $ac0, 31 \n\t" - : [step3_19] "=r" (step3_19) - : [const_2_power_13] "r" (const_2_power_13), - [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -566,18 +549,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step2_20 = step1_23 - step1_20; step2_27 = step1_24 - step1_27; - __asm__ __volatile__ ( + __asm__ 
__volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" "extp %[step3_20], $ac0, 31 \n\t" - : [step3_20] "=r" (step3_20) - : [const_2_power_13] "r" (const_2_power_13), - [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -585,18 +567,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step2_21 = step1_22 - step1_21; step2_26 = step1_25 - step1_26; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" "extp %[step3_21], $ac1, 31 \n\t" - : [step3_21] "=r" (step3_21) - : [const_2_power_13] "r" (const_2_power_13), - [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -624,7 +605,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step2_30 = step3_30 + step3_25; step2_31 = step3_31 + step3_24; - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" "lh %[load2], 32(%[input]) \n\t" "lh %[load3], 16(%[input]) \n\t" @@ -658,20 +639,19 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[step1_2], %[temp1], %[temp2] \n\t" "sub %[step1_3], %[temp0], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_16_64] "r" (cospi_16_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64) - ); + ); - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 8(%[input]) \n\t" "lh %[load2], 56(%[input]) \n\t" "lh %[load3], 40(%[input]) \n\t" @@ -724,17 +704,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_4], %[temp0], %[temp1] \n\t" "add %[step1_7], %[temp3], %[temp2] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" 
(load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); step2_0 = step1_0 + step1_7; step2_1 = step1_1 + step1_6; @@ -762,66 +740,58 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step1_14 = step2_1 - step3_14; step1_15 = step2_0 - step3_15; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_27], %[step2_20] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_20], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) - : [const_2_power_13] "r" (const_2_power_13), - [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_20 + step2_27) * cospi_16_64; step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_26], %[step2_21] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_21], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) - : [const_2_power_13] "r" (const_2_power_13), - [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_21 + step2_26) * cospi_16_64; step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_22], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) - : [const_2_power_13] "r" (const_2_power_13), - [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_22 + step2_25) * cospi_16_64; step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_24], %[step2_23] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp 
%[step1_23], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) - : [const_2_power_13] "r" (const_2_power_13), - [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_23 + step2_24) * cospi_16_64; step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -867,16 +837,14 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); int16_t *outptr = out; uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); // Rows idct32_rows_dspr2(input, outptr, 32); @@ -887,23 +855,21 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); int16_t *outptr = out; uint32_t i; uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); // Rows idct32_rows_dspr2(input, outptr, 8); outptr += 8; - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[outptr]) \n\t" "sw $zero, 4(%[outptr]) \n\t" "sw $zero, 8(%[outptr]) \n\t" @@ -918,13 +884,12 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, "sw $zero, 44(%[outptr]) \n\t" : - : [outptr] "r" (outptr) - ); + : [outptr] "r"(outptr)); for (i = 0; i < 31; ++i) { outptr += 32; - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[outptr]) \n\t" "sw $zero, 4(%[outptr]) \n\t" "sw $zero, 8(%[outptr]) \n\t" @@ -939,8 +904,7 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, "sw $zero, 44(%[outptr]) \n\t" : - : [outptr] "r" (outptr) - ); + : [outptr] "r"(outptr)); } // Columns @@ -949,43 +913,39 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { - int r, out; - int32_t a1, absa1; - int32_t vector_a1; - int32_t t1, t2, t3, t4; - int32_t vector_1, vector_2, vector_3, vector_4; - uint32_t pos = 45; + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + : + : [pos] "r"(pos)); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__ ( + __asm__ __volatile__( "addi %[out], %[out], 32 \n\t" "sra %[a1], %[out], 6 \n\t" - : [out] "+r" (out), [a1] "=r" (a1) - : - ); + : [out] "+r"(out), [a1] "=r"(a1) + :); if (a1 < 0) { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [absa1] "=r"(absa1), 
[vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 32; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t" @@ -1014,25 +974,22 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, "add %[dest], %[dest], %[stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [stride] "r" (stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } else { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 32; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t" @@ -1061,12 +1018,11 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, "add %[dest], %[dest], %[stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [stride] "r" (stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/vpx_dsp/mips/itrans4_dspr2.c b/vpx_dsp/mips/itrans4_dspr2.c index ecb8bd3de7518d0039d92deeb52e9833953a3ca9..516ea80f4ae96179ebbc2c643130ab0f4a4d3543 100644 --- a/vpx_dsp/mips/itrans4_dspr2.c +++ b/vpx_dsp/mips/itrans4_dspr2.c @@ -15,13 +15,13 @@ #if HAVE_DSPR2 void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; - int i; + int i; - for (i = 4; i--; ) { - __asm__ __volatile__ ( + for (i = 4; i--;) { + __asm__ __volatile__( /* temp_1 = (input[0] + input[2]) * cospi_16_64; step_0 = dct_const_round_shift(temp_1); @@ -83,16 +83,12 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { "sub %[Temp3], %[step_0], %[step_3] \n\t" "sh %[Temp3], 24(%[output]) \n\t" - : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), - [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), - [output] "+r" (output) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), - [cospi_24_64] "r" (cospi_24_64), - [input] "r" (input) - ); + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output) + : [const_2_power_13] "r"(const_2_power_13), + 
[cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input)); input += 4; output += 1; @@ -101,27 +97,27 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; - int i; - uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; + int i; + uint8_t *dest_pix; + uint8_t *cm = vpx_ff_cropTbl; /* prefetch vpx_ff_cropTbl */ prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); prefetch_load(vpx_ff_cropTbl + 128); prefetch_load(vpx_ff_cropTbl + 160); prefetch_load(vpx_ff_cropTbl + 192); prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 4; ++i) { - dest_pix = (dest + i); + dest_pix = (dest + i); - __asm__ __volatile__ ( + __asm__ __volatile__( /* temp_1 = (input[0] + input[2]) * cospi_16_64; step_0 = dct_const_round_shift(temp_1); @@ -206,16 +202,14 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[Temp2], %[Temp1](%[cm]) \n\t" "sb %[Temp2], 0(%[dest_pix]) \n\t" - : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), - [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), - [dest_pix] "+r" (dest_pix) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), - [cospi_24_64] "r" (cospi_24_64), - [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) - ); + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), + [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); input += 4; } @@ -228,11 +222,9 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); // Rows vpx_idct4_rows_dspr2(input, outptr); @@ -243,73 +235,63 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { - int a1, absa1; - int r; - int32_t out; - int t2, vector_a1, vector_a; - uint32_t pos = 45; - int16_t input_dc = input[0]; + int a1, absa1; + int r; + int32_t out; + int t2, vector_a1, vector_a; + uint32_t pos = 45; + int16_t input_dc = input[0]; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + : + : [pos] "r"(pos)); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); - __asm__ __volatile__ ( + __asm__ __volatile__( "addi %[out], %[out], 8 \n\t" "sra %[a1], %[out], 4 \n\t" - : [out] "+r" (out), [a1] "=r" (a1) - : - ); + : [out] "+r"(out), [a1] "=r"(a1) + :); if (a1 < 
0) { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 4; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t2], 0(%[dest]) \n\t" "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } else { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 4; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t2], 0(%[dest]) \n\t" "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/vpx_dsp/mips/itrans8_dspr2.c b/vpx_dsp/mips/itrans8_dspr2.c index 823e845d59d5618396990d4b18c5e0ae962df131..08a6c78b6e4d2a6b066f82acce04405954514e08 100644 --- a/vpx_dsp/mips/itrans8_dspr2.c +++ b/vpx_dsp/mips/itrans8_dspr2.c @@ -20,8 +20,8 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { int Temp0, Temp1, Temp2, Temp3, Temp4; int i; - for (i = no_rows; i--; ) { - __asm__ __volatile__ ( + for (i = no_rows; i--;) { + __asm__ __volatile__( /* temp_1 = (input[0] + input[4]) * cospi_16_64; step2_0 = dct_const_round_shift(temp_1); @@ -174,20 +174,18 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { "sub %[Temp1], %[step1_0], %[step1_7] \n\t" "sh %[Temp1], 112(%[output]) \n\t" - : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), - [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), - [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), - [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), - [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [Temp4] "=&r" (Temp4) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_24_64] "r" (cospi_24_64), - [output] "r" (output), [input] "r" (input) - ); + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] 
"r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [output] "r"(output), + [input] "r"(input)); input += 8; output += 1; @@ -205,18 +203,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, /* prefetch vpx_ff_cropTbl */ prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); prefetch_load(vpx_ff_cropTbl + 128); prefetch_load(vpx_ff_cropTbl + 160); prefetch_load(vpx_ff_cropTbl + 192); prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 8; ++i) { - dest_pix = (dest + i); + dest_pix = (dest + i); - __asm__ __volatile__ ( + __asm__ __volatile__( /* temp_1 = (input[0] + input[4]) * cospi_16_64; step2_0 = dct_const_round_shift(temp_1); @@ -423,20 +421,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[Temp2], %[Temp1](%[cm]) \n\t" "sb %[Temp2], 0(%[dest_pix]) \n\t" - : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), - [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), - [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), - [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), - [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dest_pix] "+r" (dest_pix) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_24_64] "r" (cospi_24_64), - [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) - ); + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); input += 8; } @@ -449,11 +445,7 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); // First transform rows idct8_rows_dspr2(input, outptr, 8); @@ -469,18 +461,14 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); // First transform rows idct8_rows_dspr2(input, outptr, 4); outptr += 4; - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[outptr]) \n\t" "sw $zero, 4(%[outptr]) \n\t" "sw $zero, 16(%[outptr]) \n\t" @@ -499,9 +487,7 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, "sw $zero, 116(%[outptr]) \n\t" : - : [outptr] "r" (outptr) - ); - + : [outptr] "r"(outptr)); 
// Then transform columns and add to dest idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); @@ -516,35 +502,31 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int32_t t1, t2, vector_a1, vector_1, vector_2; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + : + : [pos] "r"(pos)); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__ ( + __asm__ __volatile__( "addi %[out], %[out], 16 \n\t" "sra %[a1], %[out], 5 \n\t" - : [out] "+r" (out), [a1] "=r" (a1) - : - ); + : [out] "+r"(out), [a1] "=r"(a1) + :); if (a1 < 0) { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 8; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" @@ -553,24 +535,20 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } else { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 8; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" @@ -579,11 +557,9 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [dest] "+r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } } @@ -602,20 +578,20 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) { x7 = input[6]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = 0; + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = 0; return; } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; s2 = cospi_10_64 * x2 + cospi_22_64 * x3; s3 = cospi_22_64 * x2 - cospi_10_64 * x3; s4 = cospi_18_64 * x4 + cospi_14_64 * x5; s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s6 = 
cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); @@ -631,10 +607,10 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) { s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; x0 = s0 + s2; x1 = s1 + s3; @@ -656,13 +632,13 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) { x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); - output[0] = x0; + output[0] = x0; output[1] = -x4; - output[2] = x6; + output[2] = x6; output[3] = -x2; - output[4] = x3; + output[4] = x3; output[5] = -x7; - output[6] = x5; + output[6] = x5; output[7] = -x1; } #endif // HAVE_DSPR2 diff --git a/vpx_dsp/mips/loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c index 0ad1dd2f2d2a86446cc8df8d17e0a345767a21d7..4aad863de02a0d6e48b7d54dce4e8ff9495079ea 100644 --- a/vpx_dsp/mips/loopfilter_16_msa.c +++ b/vpx_dsp/mips/loopfilter_16_msa.c @@ -11,8 +11,7 @@ #include "vpx_ports/mem.h" #include "vpx_dsp/mips/loopfilter_msa.h" -int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, - uint8_t *filter48, +int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, const uint8_t *thresh_ptr) { @@ -33,8 +32,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, limit = (v16u8)__msa_fill_b(*limit_ptr); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); VPX_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); @@ -43,9 +42,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, return 1; } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, - q2_r, q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); @@ -107,8 +105,8 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { } else { src -= 7 * pitch; - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, - zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in); q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); @@ -408,8 +406,7 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr, int32_t count) { DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); uint8_t early_exit = 0; @@ -426,8 +423,7 @@ void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, 
int32_t pitch, static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr, int32_t count) { if (1 == count) { uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; uint64_t dword0, dword1; @@ -449,8 +445,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); VPX_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); @@ -472,9 +468,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, - zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, - q0_filter8); + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); /* store pixel values */ @@ -668,8 +663,8 @@ static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - LD_UB8(input, in_pitch, - p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org); + LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, + p1_org, p0_org); /* 8x8 transpose */ TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org, p7, p6, p5, p4, p3, p2, p1, p0); @@ -699,8 +694,8 @@ static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); } -static void transpose_16x16(uint8_t *input, int32_t in_pitch, - uint8_t *output, int32_t out_pitch) { +static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, + int32_t out_pitch) { v16u8 row0, row1, row2, row3, row4, row5, row6, row7; v16u8 row8, row9, row10, row11, row12, row13, row14, row15; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -709,12 +704,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch, LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); input += (8 * in_pitch); - LD_UB8(input, in_pitch, - row8, row9, row10, row11, row12, row13, row14, row15); + LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15); - TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, - row8, row9, row10, row11, row12, row13, row14, row15, - p7, p6, p5, p4, p3, p2, p1, p0); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); /* transpose 16x8 matrix into 8x16 */ /* total 8 intermediate register and 32 instructions */ @@ -779,8 +773,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, limit = (v16u8)__msa_fill_b(*limit_ptr); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); /* flat4 */ 
VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ @@ -794,9 +788,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); return 1; } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); @@ -864,9 +857,9 @@ int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, } else { src -= 7 * 16; - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, - zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, - p3_r_in, p2_r_in, p1_r_in, p0_r_in); + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); tmp0_r = p7_r_in << 3; @@ -1056,9 +1049,9 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); - early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), - &filter48[0], src, pitch, b_limit_ptr, - limit_ptr, thresh_ptr); + early_exit = + vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, @@ -1093,8 +1086,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, limit = (v16u8)__msa_fill_b(*limit_ptr); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); /* flat4 */ VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ @@ -1113,9 +1106,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, return 1; } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); @@ -1196,9 +1188,9 @@ int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, } else { src -= 7 * 16; - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, - zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, - p3_r_in, p2_r_in, p1_r_in, p0_r_in); + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); tmp0_r = p7_r_in << 3; @@ -1479,9 +1471,9 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, transpose_16x16((src - 8), pitch, &transposed_input[0], 16); - early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), - &filter48[0], src, pitch, b_limit_ptr, - limit_ptr, thresh_ptr); + early_exit = + vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), 
&filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, diff --git a/vpx_dsp/mips/loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c index 5ea27ae6a743e5fb2fc231dcb748e3a436ae4370..fe216c2b6f938bf39c4cb8b9532c0e7b8ab0079d 100644 --- a/vpx_dsp/mips/loopfilter_4_msa.c +++ b/vpx_dsp/mips/loopfilter_4_msa.c @@ -25,8 +25,8 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); p1_d = __msa_copy_u_d((v2i64)p1_out, 0); @@ -61,8 +61,8 @@ void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); VPX_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); @@ -82,10 +82,10 @@ void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); - TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, - p3, p2, p1, p0, q0, q1, q2, q3); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); ILVRL_H2_SH(vec1, vec0, vec2, vec3); @@ -111,12 +111,12 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); - LD_UB8(src - 4 + (8 * pitch), pitch, - row8, row9, row10, row11, row12, row13, row14, row15); + LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, + row14, row15); - TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, - row8, row9, row10, row11, row12, row13, row14, row15, - p3, p2, p1, p0, q0, q1, q2, q3); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); @@ -130,8 +130,8 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); VPX_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); diff --git a/vpx_dsp/mips/loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c index 5d5dbd26cb7f5c4a847bdc0d82c93abc5fe274c4..af0d628fa36e7fdfd75ecd1c6d64e98c027bb32e 100644 --- a/vpx_dsp/mips/loopfilter_8_msa.c 
+++ b/vpx_dsp/mips/loopfilter_8_msa.c @@ -29,8 +29,8 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); VPX_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); @@ -43,16 +43,14 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, q1_d = __msa_copy_u_d((v2i64)q1_out, 0); SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, - q2_r, q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, - zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, - q0_filter8); + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); /* store pixel values */ @@ -80,13 +78,10 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, } } -void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *b_limit1, - const uint8_t *limit1, - const uint8_t *thresh1) { +void vpx_lpf_horizontal_8_dual_msa( + uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; @@ -112,17 +107,16 @@ void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch, limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); VPX_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); if (__msa_test_bz_v(flat)) { ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, - q2_r, q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); @@ -170,16 +164,16 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, /* load vector elements */ LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); - TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, - p3, p2, p1, p0, q0, q1, q2, q3); + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); thresh = 
(v16u8)__msa_fill_b(*thresh_ptr); b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); /* flat4 */ VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ @@ -197,9 +191,8 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, src += 4 * pitch; ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); /* convert 16 bit output data into 8 bit */ @@ -232,11 +225,9 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, } void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0, - const uint8_t *limit0, + const uint8_t *b_limit0, const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *b_limit1, - const uint8_t *limit1, + const uint8_t *b_limit1, const uint8_t *limit1, const uint8_t *thresh1) { uint8_t *temp_src; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; @@ -257,9 +248,9 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); /* transpose 16x8 matrix into 8x16 */ - TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, - q3, q2, q1, q0, row12, row13, row14, row15, - p3, p2, p1, p0, q0, q1, q2, q3); + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); thresh = (v16u8)__msa_fill_b(*thresh0); vec0 = (v8i16)__msa_fill_b(*thresh1); @@ -274,8 +265,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); /* flat4 */ VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ @@ -292,9 +283,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, src += 8 * pitch; ST4x8_UB(vec4, vec5, src, pitch); } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c index 8414b9ed53f840891913f388a66affe759dba73e..f1743679a7d690063f9daa23e40959c4bff009f6 100644 --- a/vpx_dsp/mips/loopfilter_filters_dspr2.c +++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c @@ -19,33 +19,30 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -void vpx_lpf_horizontal_4_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, +void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, 
const uint8_t *thresh) { - uint8_t i; - uint32_t mask; - uint32_t hev; - uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; - uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; + uint8_t i; + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; uflimit = *blimit; ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); /* prefetch data for store */ prefetch_store(s); @@ -62,49 +59,44 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s, s5 = s4 + pitch; s6 = s5 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p1], (%[s1]) \n\t" "lw %[p2], (%[s2]) \n\t" "lw %[p3], (%[s3]) \n\t" "lw %[p4], (%[s4]) \n\t" - : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4) - : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); /* if (p1 - p4 == 0) and (p2 - p3 == 0) mask will be zero and filtering is not needed */ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[pm1], (%[sm1]) \n\t" "lw %[p0], (%[s0]) \n\t" "lw %[p5], (%[s5]) \n\t" "lw %[p6], (%[s6]) \n\t" - : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5), - [p6] "=&r" (p6) - : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6) - ); + : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6) + : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)); - filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, - pm1, p0, p3, p4, p5, p6, - thresh_vec, &hev, &mask); + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); /* if mask == 0 do filtering is not needed */ if (mask) { /* filtering */ filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p1], (%[s1]) \n\t" "sw %[p2], (%[s2]) \n\t" "sw %[p3], (%[s3]) \n\t" "sw %[p4], (%[s4]) \n\t" : - : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4), - [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), + [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); } } @@ -112,33 +104,30 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s, } } -void vpx_lpf_vertical_4_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, +void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev; - uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; + uint8_t i; + uint32_t mask, hev; + 
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; uflimit = *blimit; ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); /* prefetch data for store */ prefetch_store(s + pitch); @@ -148,22 +137,22 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, s2 = s + pitch; s3 = s2 + pitch; s4 = s3 + pitch; - s = s4 + pitch; + s = s4 + pitch; /* load quad-byte vectors * memory is 4 byte aligned */ - p2 = *((uint32_t *)(s1 - 4)); - p6 = *((uint32_t *)(s1)); - p1 = *((uint32_t *)(s2 - 4)); - p5 = *((uint32_t *)(s2)); - p0 = *((uint32_t *)(s3 - 4)); - p4 = *((uint32_t *)(s3)); + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); pm1 = *((uint32_t *)(s4 - 4)); - p3 = *((uint32_t *)(s4)); + p3 = *((uint32_t *)(s4)); /* transpose pm1, p0, p1, p2 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" @@ -179,15 +168,13 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, "append %[p1], %[sec3], 16 \n\t" "append %[pm1], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose p3, p4, p5, p6 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" @@ -203,20 +190,17 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, "append %[p5], %[sec3], 16 \n\t" "append %[p3], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* if (p1 - p4 == 0) and (p2 - p3 == 0) * mask will be zero and filtering is not needed */ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { - filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, - p0, p3, p4, p5, p6, thresh_vec, - &hev, &mask); + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); /* if mask == 0 do filtering is not needed */ if (mask) { @@ -227,107 +211,93 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, * don't use 
transpose on output data * because memory isn't aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p4], 1(%[s4]) \n\t" "sb %[p3], 0(%[s4]) \n\t" "sb %[p2], -1(%[s4]) \n\t" "sb %[p1], -2(%[s4]) \n\t" : - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), - [s4] "r" (s4) - ); + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s4] "r"(s4)); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p4], %[p4], 8 \n\t" "srl %[p3], %[p3], 8 \n\t" "srl %[p2], %[p2], 8 \n\t" "srl %[p1], %[p1], 8 \n\t" - : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) - : - ); + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p4], 1(%[s3]) \n\t" "sb %[p3], 0(%[s3]) \n\t" "sb %[p2], -1(%[s3]) \n\t" "sb %[p1], -2(%[s3]) \n\t" - : [p1] "+r" (p1) - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3) - ); + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3)); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p4], %[p4], 8 \n\t" "srl %[p3], %[p3], 8 \n\t" "srl %[p2], %[p2], 8 \n\t" "srl %[p1], %[p1], 8 \n\t" - : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) - : - ); + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p4], 1(%[s2]) \n\t" "sb %[p3], 0(%[s2]) \n\t" "sb %[p2], -1(%[s2]) \n\t" "sb %[p1], -2(%[s2]) \n\t" : - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), - [s2] "r" (s2) - ); + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s2] "r"(s2)); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p4], %[p4], 8 \n\t" "srl %[p3], %[p3], 8 \n\t" "srl %[p2], %[p2], 8 \n\t" "srl %[p1], %[p1], 8 \n\t" - : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) - : - ); + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p4], 1(%[s1]) \n\t" "sb %[p3], 0(%[s1]) \n\t" "sb %[p2], -1(%[s1]) \n\t" "sb %[p1], -2(%[s1]) \n\t" : - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), - [s1] "r" (s1) - ); + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s1] "r"(s1)); } } } } -void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { +void vpx_lpf_horizontal_4_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); } -void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { +void vpx_lpf_horizontal_8_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, - const uint8_t *blimit0, +void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, 
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, @@ -337,8 +307,7 @@ void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, - const uint8_t *blimit0, +void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, @@ -348,8 +317,7 @@ void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, - const uint8_t *blimit, +void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.h b/vpx_dsp/mips/loopfilter_filters_dspr2.h index db39854368cdcaa595df1ecd080d8db657917db1..11f286d281beb6f83290abc48f1c09ad7c6d980a 100644 --- a/vpx_dsp/mips/loopfilter_filters_dspr2.h +++ b/vpx_dsp/mips/loopfilter_filters_dspr2.h @@ -24,22 +24,21 @@ extern "C" { #if HAVE_DSPR2 /* inputs & outputs are quad-byte vectors */ -static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, - uint32_t *ps1, uint32_t *ps0, - uint32_t *qs0, uint32_t *qs1) { - int32_t vpx_filter_l, vpx_filter_r; - int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; - int32_t subr_r, subr_l; - uint32_t t1, t2, HWM, t3; - uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; - int32_t vps1, vps0, vqs0, vqs1; - int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; - uint32_t N128; +static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1, + uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) { + int32_t vpx_filter_l, vpx_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; N128 = 0x80808080; - t1 = 0x03000300; - t2 = 0x04000400; - t3 = 0x01000100; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; HWM = 0xFF00FF00; vps0 = (*ps0) ^ N128; @@ -72,7 +71,7 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, hev_r = hev << 8; hev_r = hev_r & HWM; - __asm__ __volatile__ ( + __asm__ __volatile__( /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */ "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t" "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t" @@ -99,20 +98,17 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t" "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t" - : [vpx_filter_l] "=&r" (vpx_filter_l), - [vpx_filter_r] "=&r" (vpx_filter_r), - [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), - [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) - : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), - [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), - [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), - [mask_l] "r" (mask_l), [mask_r] "r" (mask_r), - [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), - [HWM] "r" (HWM) - ); + : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] 
"r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); /* save bottom 3 bits so that we round one side +4 and the other +3 */ - __asm__ __volatile__ ( + __asm__ __volatile__( /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */ "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t" "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t" @@ -137,15 +133,14 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" - : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), - [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), - [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), - [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) - : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), - [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r) - ); + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* (vpx_filter += 1) >>= 1 */ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" @@ -162,11 +157,10 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" - : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), - [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), - [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) - : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) - ); + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); /* Create quad-bytes from halfword pairs */ vqs0_l = vqs0_l & HWM; @@ -174,16 +168,15 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, vps0_l = vps0_l & HWM; vps1_l = vps1_l & HWM; - __asm__ __volatile__ ( + __asm__ __volatile__( "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" - : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), - [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) - : - ); + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); vqs0 = vqs0_l | vqs0_r; vqs1 = vqs1_l | vqs1_r; @@ -196,24 +189,23 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, *qs1 = vqs1 ^ N128; } -static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, - uint32_t ps1, uint32_t ps0, - uint32_t qs0, uint32_t qs1, +static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, + uint32_t ps0, uint32_t qs0, uint32_t qs1, uint32_t *p1_f0, uint32_t *p0_f0, uint32_t *q0_f0, uint32_t *q1_f0) { - int32_t vpx_filter_l, vpx_filter_r; - int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; - int32_t subr_r, subr_l; - uint32_t t1, t2, HWM, t3; - uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; - 
int32_t vps1, vps0, vqs0, vqs1; - int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; - uint32_t N128; + int32_t vpx_filter_l, vpx_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; N128 = 0x80808080; - t1 = 0x03000300; - t2 = 0x04000400; - t3 = 0x01000100; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; HWM = 0xFF00FF00; vps0 = (ps0) ^ N128; @@ -246,7 +238,7 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, hev_r = hev << 8; hev_r = hev_r & HWM; - __asm__ __volatile__ ( + __asm__ __volatile__( /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */ "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t" "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t" @@ -273,19 +265,17 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t" "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t" - : [vpx_filter_l] "=&r" (vpx_filter_l), - [vpx_filter_r] "=&r" (vpx_filter_r), - [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), - [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) - : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), - [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), - [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), - [mask_l] "r" (mask_l), [mask_r] "r" (mask_r), - [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM) - ); + : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); /* save bottom 3 bits so that we round one side +4 and the other +3 */ - __asm__ __volatile__ ( + __asm__ __volatile__( /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */ "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t" "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t" @@ -310,15 +300,14 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" - : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), - [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), - [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), - [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) - : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), - [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r) - ); + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* (vpx_filter += 1) >>= 1 */ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" @@ -335,11 +324,10 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t 
hev, "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" - : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), - [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), - [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) - : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) - ); + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); /* Create quad-bytes from halfword pairs */ vqs0_l = vqs0_l & HWM; @@ -347,16 +335,15 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, vps0_l = vps0_l & HWM; vps1_l = vps1_l & HWM; - __asm__ __volatile__ ( + __asm__ __volatile__( "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" - : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), - [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) - : - ); + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); vqs0 = vqs0_l | vqs0_r; vqs1 = vqs1_l | vqs1_r; @@ -369,18 +356,17 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, *q1_f0 = vqs1 ^ N128; } -static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, - uint32_t *op1, uint32_t *op0, - uint32_t *oq0, uint32_t *oq1, +static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, + uint32_t *op0, uint32_t *oq0, uint32_t *oq1, uint32_t *oq2, uint32_t *oq3) { /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - uint32_t res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2; - uint32_t tmp; - uint32_t add_p210_q012; - uint32_t u32Four = 0x00040004; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ @@ -389,7 +375,7 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" @@ -428,15 +414,12 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, "shrl.ph %[res_op0], %[res_op0], 3 \n\t" "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" - : [add_p210_q012] "=&r" (add_p210_q012), - [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2), - [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0), - [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1), - [res_oq2] "=&r" (res_oq2) - : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), - [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3), - [u32Four] "r" (u32Four) - ); + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), 
[p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); *op2 = res_op2; *op1 = res_op1; @@ -446,20 +429,18 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, *oq2 = res_oq2; } -static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, - uint32_t p1, uint32_t p0, - uint32_t q0, uint32_t q1, - uint32_t q2, uint32_t q3, - uint32_t *op2_f1, +static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, uint32_t *op2_f1, uint32_t *op1_f1, uint32_t *op0_f1, uint32_t *oq0_f1, uint32_t *oq1_f1, uint32_t *oq2_f1) { /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ - uint32_t res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2; - uint32_t tmp; - uint32_t add_p210_q012; - uint32_t u32Four = 0x00040004; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ @@ -468,7 +449,7 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" @@ -507,14 +488,12 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, "shrl.ph %[res_op0], %[res_op0], 3 \n\t" "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" - : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp), - [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), - [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0), - [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2) - : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), - [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3), - [u32Four] "r" (u32Four) - ); + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); *op2_f1 = res_op2; *op1_f1 = res_op1; @@ -524,25 +503,22 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, *oq2_f1 = res_oq2; } -static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, - uint32_t *op5, uint32_t *op4, - uint32_t *op3, uint32_t *op2, - uint32_t *op1, uint32_t *op0, - uint32_t *oq0, uint32_t *oq1, - uint32_t *oq2, uint32_t *oq3, - uint32_t *oq4, uint32_t *oq5, - uint32_t *oq6, uint32_t *oq7) { +static INLINE void wide_mbfilter_dspr2( + uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, + uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, + uint32_t *oq7) { const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; - uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; - 
uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; - uint32_t tmp; - uint32_t add_p6toq6; - uint32_t u32Eight = 0x00080008; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; - __asm__ __volatile__ ( + __asm__ __volatile__( /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 which is used most of the time */ "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" @@ -560,15 +536,13 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" - : [add_p6toq6] "=&r" (add_p6toq6) - : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), - [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3), - [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), - [u32Eight] "r" (u32Eight) - ); + : [add_p6toq6] "=&r"(add_p6toq6) + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), + [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [u32Eight] "r"(u32Eight)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4) */ "shll.ph %[tmp], %[p7], 3 \n\t" @@ -643,16 +617,14 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" "shrl.ph %[res_op0], %[res_op0], 4 \n\t" - : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5), - [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3), - [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), - [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp) - : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), - [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [q2] "r" (q2), [q1] "r" (q1), - [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), - [add_p6toq6] "r" (add_p6toq6) - ); + : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), + [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [add_p6toq6] "r"(add_p6toq6)); *op6 = res_op6; *op5 = res_op5; @@ -662,7 +634,7 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, *op1 = res_op1; *op0 = res_op0; - __asm__ __volatile__ ( + __asm__ __volatile__( /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ "addu.ph %[res_oq0], %[q7], %[q0] \n\t" @@ -737,16 +709,14 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" - : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5), - [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3), - [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1), - [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp) - : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), - [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), - [p1] "r" (p1), [p2] "r" (p2), - [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6), - 
[add_p6toq6] "r" (add_p6toq6) - ); + : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), + [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), + [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), + [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) + : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), + [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), + [add_p6toq6] "r"(add_p6toq6)); *oq0 = res_oq0; *oq1 = res_oq1; diff --git a/vpx_dsp/mips/loopfilter_macros_dspr2.h b/vpx_dsp/mips/loopfilter_macros_dspr2.h index a990b4061bab30490803a13b6a7f4d0aff37fa5c..769371dff8aa2e4f8ee23af309a523a69bb7d2c1 100644 --- a/vpx_dsp/mips/loopfilter_macros_dspr2.h +++ b/vpx_dsp/mips/loopfilter_macros_dspr2.h @@ -22,453 +22,410 @@ extern "C" { #endif #if HAVE_DSPR2 -#define STORE_F0() { \ - __asm__ __volatile__ ( \ - "sb %[q1_f0], 1(%[s4]) \n\t" \ - "sb %[q0_f0], 0(%[s4]) \n\t" \ - "sb %[p0_f0], -1(%[s4]) \n\t" \ - "sb %[p1_f0], -2(%[s4]) \n\t" \ - \ - : \ - : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ - [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ - [s4] "r" (s4) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ - [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q1_f0], 1(%[s3]) \n\t" \ - "sb %[q0_f0], 0(%[s3]) \n\t" \ - "sb %[p0_f0], -1(%[s3]) \n\t" \ - "sb %[p1_f0], -2(%[s3]) \n\t" \ - \ - : [p1_f0] "+r" (p1_f0) \ - : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ - [s3] "r" (s3), [p0_f0] "r" (p0_f0) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ - [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q1_f0], 1(%[s2]) \n\t" \ - "sb %[q0_f0], 0(%[s2]) \n\t" \ - "sb %[p0_f0], -1(%[s2]) \n\t" \ - "sb %[p1_f0], -2(%[s2]) \n\t" \ - \ - : \ - : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ - [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ - [s2] "r" (s2) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ - [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q1_f0], 1(%[s1]) \n\t" \ - "sb %[q0_f0], 0(%[s1]) \n\t" \ - "sb %[p0_f0], -1(%[s1]) \n\t" \ - "sb %[p1_f0], -2(%[s1]) \n\t" \ - \ - : \ - : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ - [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ - [s1] "r" (s1) \ - ); \ -} +#define STORE_F0() \ + { \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s4]) \n\t" \ + "sb %[q0_f0], 0(%[s4]) \n\t" \ + "sb %[p0_f0], -1(%[s4]) \n\t" \ + "sb %[p1_f0], -2(%[s4]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 
1(%[s3]) \n\t" \ + "sb %[q0_f0], 0(%[s3]) \n\t" \ + "sb %[p0_f0], -1(%[s3]) \n\t" \ + "sb %[p1_f0], -2(%[s3]) \n\t" \ + \ + : [p1_f0] "+r"(p1_f0) \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \ + [p0_f0] "r"(p0_f0)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s2]) \n\t" \ + "sb %[q0_f0], 0(%[s2]) \n\t" \ + "sb %[p0_f0], -1(%[s2]) \n\t" \ + "sb %[p1_f0], -2(%[s2]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s1]) \n\t" \ + "sb %[q0_f0], 0(%[s1]) \n\t" \ + "sb %[p0_f0], -1(%[s1]) \n\t" \ + "sb %[p1_f0], -2(%[s1]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \ + } -#define STORE_F1() { \ - __asm__ __volatile__ ( \ - "sb %[q2_r], 2(%[s4]) \n\t" \ - "sb %[q1_r], 1(%[s4]) \n\t" \ - "sb %[q0_r], 0(%[s4]) \n\t" \ - "sb %[p0_r], -1(%[s4]) \n\t" \ - "sb %[p1_r], -2(%[s4]) \n\t" \ - "sb %[p2_r], -3(%[s4]) \n\t" \ - \ - : \ - : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ - [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ - [s4] "r" (s4) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q2_r], %[q2_r], 16 \n\t" \ - "srl %[q1_r], %[q1_r], 16 \n\t" \ - "srl %[q0_r], %[q0_r], 16 \n\t" \ - "srl %[p0_r], %[p0_r], 16 \n\t" \ - "srl %[p1_r], %[p1_r], 16 \n\t" \ - "srl %[p2_r], %[p2_r], 16 \n\t" \ - \ - : [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r), \ - [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q2_r], 2(%[s3]) \n\t" \ - "sb %[q1_r], 1(%[s3]) \n\t" \ - "sb %[q0_r], 0(%[s3]) \n\t" \ - "sb %[p0_r], -1(%[s3]) \n\t" \ - "sb %[p1_r], -2(%[s3]) \n\t" \ - "sb %[p2_r], -3(%[s3]) \n\t" \ - \ - : \ - : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ - [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ - [s3] "r" (s3) \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q2_l], 2(%[s2]) \n\t" \ - "sb %[q1_l], 1(%[s2]) \n\t" \ - "sb %[q0_l], 0(%[s2]) \n\t" \ - "sb %[p0_l], -1(%[s2]) \n\t" \ - "sb %[p1_l], -2(%[s2]) \n\t" \ - "sb %[p2_l], -3(%[s2]) \n\t" \ - \ - : \ - : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ - [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ - [s2] "r" (s2) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q2_l], %[q2_l], 16 \n\t" \ - "srl %[q1_l], %[q1_l], 16 \n\t" \ - "srl %[q0_l], %[q0_l], 16 \n\t" \ - "srl %[p0_l], %[p0_l], 16 \n\t" \ - "srl %[p1_l], %[p1_l], 16 \n\t" \ - "srl %[p2_l], %[p2_l], 16 \n\t" \ - \ - : [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l), \ - [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q2_l], 2(%[s1]) \n\t" \ - "sb %[q1_l], 1(%[s1]) \n\t" \ - "sb %[q0_l], 0(%[s1]) \n\t" \ - "sb %[p0_l], -1(%[s1]) \n\t" \ - "sb %[p1_l], -2(%[s1]) \n\t" \ - "sb %[p2_l], -3(%[s1]) \n\t" \ - \ - 
: \ - : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ - [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ - [s1] "r" (s1) \ - ); \ -} +#define STORE_F1() \ + { \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + \ + : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \ + [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ + [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ + } -#define STORE_F2() { \ - __asm__ __volatile__ ( \ - "sb %[q6_r], 6(%[s4]) \n\t" \ - "sb %[q5_r], 5(%[s4]) \n\t" \ - "sb %[q4_r], 4(%[s4]) \n\t" \ - "sb %[q3_r], 3(%[s4]) \n\t" \ - "sb %[q2_r], 2(%[s4]) \n\t" \ - "sb %[q1_r], 1(%[s4]) \n\t" \ - "sb %[q0_r], 0(%[s4]) \n\t" \ - "sb %[p0_r], -1(%[s4]) \n\t" \ - "sb %[p1_r], -2(%[s4]) \n\t" \ - "sb %[p2_r], -3(%[s4]) \n\t" \ - "sb %[p3_r], -4(%[s4]) \n\t" \ - "sb %[p4_r], -5(%[s4]) \n\t" \ - "sb %[p5_r], -6(%[s4]) \n\t" \ - "sb %[p6_r], -7(%[s4]) \n\t" \ - \ - : \ - : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ - [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ - [q0_r] "r" (q0_r), \ - [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ - [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ - [p6_r] "r" (p6_r), \ - [s4] "r" (s4) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q6_r], %[q6_r], 16 \n\t" \ - "srl %[q5_r], %[q5_r], 16 \n\t" \ - "srl %[q4_r], %[q4_r], 16 \n\t" \ - "srl %[q3_r], %[q3_r], 16 \n\t" \ - "srl %[q2_r], %[q2_r], 16 \n\t" \ - "srl %[q1_r], %[q1_r], 16 \n\t" \ - "srl %[q0_r], %[q0_r], 16 \n\t" \ - "srl %[p0_r], 
%[p0_r], 16 \n\t" \ - "srl %[p1_r], %[p1_r], 16 \n\t" \ - "srl %[p2_r], %[p2_r], 16 \n\t" \ - "srl %[p3_r], %[p3_r], 16 \n\t" \ - "srl %[p4_r], %[p4_r], 16 \n\t" \ - "srl %[p5_r], %[p5_r], 16 \n\t" \ - "srl %[p6_r], %[p6_r], 16 \n\t" \ - \ - : [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r), \ - [q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), \ - [q0_r] "+r" (q0_r), \ - [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r), \ - [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r), \ - [p6_r] "+r" (p6_r) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q6_r], 6(%[s3]) \n\t" \ - "sb %[q5_r], 5(%[s3]) \n\t" \ - "sb %[q4_r], 4(%[s3]) \n\t" \ - "sb %[q3_r], 3(%[s3]) \n\t" \ - "sb %[q2_r], 2(%[s3]) \n\t" \ - "sb %[q1_r], 1(%[s3]) \n\t" \ - "sb %[q0_r], 0(%[s3]) \n\t" \ - "sb %[p0_r], -1(%[s3]) \n\t" \ - "sb %[p1_r], -2(%[s3]) \n\t" \ - "sb %[p2_r], -3(%[s3]) \n\t" \ - "sb %[p3_r], -4(%[s3]) \n\t" \ - "sb %[p4_r], -5(%[s3]) \n\t" \ - "sb %[p5_r], -6(%[s3]) \n\t" \ - "sb %[p6_r], -7(%[s3]) \n\t" \ - \ - : \ - : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ - [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ - [q0_r] "r" (q0_r), \ - [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ - [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ - [p6_r] "r" (p6_r), \ - [s3] "r" (s3) \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q6_l], 6(%[s2]) \n\t" \ - "sb %[q5_l], 5(%[s2]) \n\t" \ - "sb %[q4_l], 4(%[s2]) \n\t" \ - "sb %[q3_l], 3(%[s2]) \n\t" \ - "sb %[q2_l], 2(%[s2]) \n\t" \ - "sb %[q1_l], 1(%[s2]) \n\t" \ - "sb %[q0_l], 0(%[s2]) \n\t" \ - "sb %[p0_l], -1(%[s2]) \n\t" \ - "sb %[p1_l], -2(%[s2]) \n\t" \ - "sb %[p2_l], -3(%[s2]) \n\t" \ - "sb %[p3_l], -4(%[s2]) \n\t" \ - "sb %[p4_l], -5(%[s2]) \n\t" \ - "sb %[p5_l], -6(%[s2]) \n\t" \ - "sb %[p6_l], -7(%[s2]) \n\t" \ - \ - : \ - : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ - [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ - [q0_l] "r" (q0_l), \ - [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ - [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ - [p6_l] "r" (p6_l), \ - [s2] "r" (s2) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q6_l], %[q6_l], 16 \n\t" \ - "srl %[q5_l], %[q5_l], 16 \n\t" \ - "srl %[q4_l], %[q4_l], 16 \n\t" \ - "srl %[q3_l], %[q3_l], 16 \n\t" \ - "srl %[q2_l], %[q2_l], 16 \n\t" \ - "srl %[q1_l], %[q1_l], 16 \n\t" \ - "srl %[q0_l], %[q0_l], 16 \n\t" \ - "srl %[p0_l], %[p0_l], 16 \n\t" \ - "srl %[p1_l], %[p1_l], 16 \n\t" \ - "srl %[p2_l], %[p2_l], 16 \n\t" \ - "srl %[p3_l], %[p3_l], 16 \n\t" \ - "srl %[p4_l], %[p4_l], 16 \n\t" \ - "srl %[p5_l], %[p5_l], 16 \n\t" \ - "srl %[p6_l], %[p6_l], 16 \n\t" \ - \ - : [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l), \ - [q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), \ - [q0_l] "+r" (q0_l), \ - [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l), \ - [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l), \ - [p6_l] "+r" (p6_l) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q6_l], 6(%[s1]) \n\t" \ - "sb %[q5_l], 5(%[s1]) \n\t" \ - "sb %[q4_l], 4(%[s1]) \n\t" \ - "sb %[q3_l], 3(%[s1]) \n\t" \ - "sb %[q2_l], 2(%[s1]) \n\t" \ - "sb %[q1_l], 1(%[s1]) \n\t" \ - "sb %[q0_l], 0(%[s1]) \n\t" \ - "sb %[p0_l], -1(%[s1]) \n\t" \ - "sb %[p1_l], -2(%[s1]) \n\t" \ - "sb %[p2_l], -3(%[s1]) \n\t" \ - "sb %[p3_l], -4(%[s1]) \n\t" \ - "sb %[p4_l], -5(%[s1]) \n\t" \ - "sb %[p5_l], -6(%[s1]) \n\t" \ - "sb %[p6_l], -7(%[s1]) \n\t" \ - \ - : \ - : [q6_l] "r" 
(q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ - [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ - [q0_l] "r" (q0_l), \ - [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ - [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ - [p6_l] "r" (p6_l), \ - [s1] "r" (s1) \ - ); \ -} +#define STORE_F2() \ + { \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ + [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \ + [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ + [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ + [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl 
%[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ + [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ + [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ + [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ + [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ + } -#define PACK_LEFT_0TO3() { \ - __asm__ __volatile__ ( \ - "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ - "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ - "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ - "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ - "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ - "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ - "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ - "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ - \ - : [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l), \ - [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l), \ - [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l), \ - [q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l) \ - : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ - ); \ -} +#define PACK_LEFT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ + [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ + [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } -#define PACK_LEFT_4TO7() { \ - __asm__ __volatile__ ( \ - "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ - "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ - "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ - "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ - "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ - "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ - "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ - "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ - \ - : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l), \ - [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l), \ - [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l), \ - [q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l) \ - : [p7] "r" 
(p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ - [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ - ); \ -} +#define PACK_LEFT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \ + [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ + [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } -#define PACK_RIGHT_0TO3() { \ - __asm__ __volatile__ ( \ - "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ - "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ - "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ - "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ - "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ - "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ - "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ - "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ - \ - : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r), \ - [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r), \ - [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r), \ - [q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r) \ - : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ - ); \ -} +#define PACK_RIGHT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ + [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ + [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } -#define PACK_RIGHT_4TO7() { \ - __asm__ __volatile__ ( \ - "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ - "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ - "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ - "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ - "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ - "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ - "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ - "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ - \ - : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r), \ - [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r), \ - [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r), \ - [q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r) \ - : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ - [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ - ); \ -} +#define PACK_RIGHT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ + [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ + [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] 
"r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } -#define COMBINE_LEFT_RIGHT_0TO2() { \ - __asm__ __volatile__ ( \ - "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ - "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ - "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ - "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ - "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ - "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ - \ - : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), \ - [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2) \ - : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r), \ - [p1_l] "r" (p1_l), [p1_r] "r" (p1_r), \ - [p0_l] "r" (p0_l), [p0_r] "r" (p0_r), \ - [q0_l] "r" (q0_l), [q0_r] "r" (q0_r), \ - [q1_l] "r" (q1_l), [q1_r] "r" (q1_r), \ - [q2_l] "r" (q2_l), [q2_r] "r" (q2_r) \ - ); \ -} +#define COMBINE_LEFT_RIGHT_0TO2() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ + [q1] "=&r"(q1), [q2] "=&r"(q2) \ + : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ + [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ + [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ + [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ + } -#define COMBINE_LEFT_RIGHT_3TO6() { \ - __asm__ __volatile__ ( \ - "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ - "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ - "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ - "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ - "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ - "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ - "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ - "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ - \ - : [p6] "=&r" (p6),[p5] "=&r" (p5), \ - [p4] "=&r" (p4),[p3] "=&r" (p3), \ - [q3] "=&r" (q3),[q4] "=&r" (q4), \ - [q5] "=&r" (q5),[q6] "=&r" (q6) \ - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), \ - [p4_l] "r" (p4_l), [p3_l] "r" (p3_l), \ - [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), \ - [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), \ - [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), \ - [q5_l] "r" (q5_l), [q6_l] "r" (q6_l), \ - [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), \ - [q5_r] "r" (q5_r), [q6_r] "r" (q6_r) \ - ); \ -} +#define COMBINE_LEFT_RIGHT_3TO6() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ + [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ + [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ + [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ + [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ + [q6_r] "r"(q6_r)); \ + } #endif // #if HAVE_DSPR2 #ifdef __cplusplus diff --git a/vpx_dsp/mips/loopfilter_masks_dspr2.h b/vpx_dsp/mips/loopfilter_masks_dspr2.h index 9bf292705a62d50b2d112cec0c5592f7ec6e1c68..0a0cf577e332ead7d31826ef8f226ab339db7467 100644 --- 
a/vpx_dsp/mips/loopfilter_masks_dspr2.h +++ b/vpx_dsp/mips/loopfilter_masks_dspr2.h @@ -25,18 +25,17 @@ extern "C" { /* processing 4 pixels at the same time * compute hev and mask in the same function */ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, - uint32_t p1, uint32_t p0, - uint32_t p3, uint32_t p2, - uint32_t q0, uint32_t q1, + uint32_t p1, uint32_t p0, uint32_t p3, + uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3, uint32_t thresh, uint32_t *hev, uint32_t *mask) { - uint32_t c, r, r3, r_k; - uint32_t s1, s2, s3; - uint32_t ones = 0xFFFFFFFF; - uint32_t hev1; + uint32_t c, r, r3, r_k; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t hev1; - __asm__ __volatile__ ( + __asm__ __volatile__( /* mask |= (abs(p3 - p2) > limit) */ "subu_s.qb %[c], %[p3], %[p2] \n\t" "subu_s.qb %[r_k], %[p2], %[p3] \n\t" @@ -88,14 +87,12 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" "or %[r], %[r], %[c] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), - [r] "=&r" (r), [r3] "=&r" (r3) - : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), - [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), - [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* abs(p0 - q0) */ "subu_s.qb %[c], %[p0], %[q0] \n\t" "subu_s.qb %[r_k], %[q0], %[p0] \n\t" @@ -119,34 +116,27 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, "wrdsp %[r] \n\t" "pick.qb %[s2], $0, %[ones] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), - [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) - : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), - [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); *hev = hev1; *mask = s2; } -static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit, - uint32_t flimit, - uint32_t thresh, - uint32_t p1, uint32_t p0, - uint32_t p3, uint32_t p2, - uint32_t q0, uint32_t q1, - uint32_t q2, uint32_t q3, - uint32_t *hev, - uint32_t *mask, - uint32_t *flat) { - uint32_t c, r, r3, r_k, r_flat; - uint32_t s1, s2, s3; - uint32_t ones = 0xFFFFFFFF; - uint32_t flat_thresh = 0x01010101; - uint32_t hev1; - uint32_t flat1; - - __asm__ __volatile__ ( +static INLINE void filter_hev_mask_flatmask4_dspr2( + uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__( /* mask |= (abs(p3 - p2) > limit) */ "subu_s.qb %[c], %[p3], %[p2] \n\t" "subu_s.qb %[r_k], %[p2], %[p3] \n\t" @@ -236,15 +226,13 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit, "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" "or %[r], %[r], %[c] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" 
(r3), - [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1) - : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), - [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), - [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh), - [flat_thresh] "r" (flat_thresh), [ones] "r" (ones) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), + [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* abs(p0 - q0) */ "subu_s.qb %[c], %[p0], %[q0] \n\t" "subu_s.qb %[r_k], %[q0], %[p0] \n\t" @@ -268,29 +256,25 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit, "wrdsp %[r] \n\t" "pick.qb %[s2], $0, %[ones] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), - [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) - : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), - [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); *hev = hev1; *mask = s2; *flat = flat1; } -static INLINE void flatmask5(uint32_t p4, uint32_t p3, - uint32_t p2, uint32_t p1, - uint32_t p0, uint32_t q0, - uint32_t q1, uint32_t q2, - uint32_t q3, uint32_t q4, - uint32_t *flat2) { - uint32_t c, r, r_k, r_flat; - uint32_t ones = 0xFFFFFFFF; - uint32_t flat_thresh = 0x01010101; - uint32_t flat1, flat3; - - __asm__ __volatile__ ( +static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__( /* flat |= (abs(p4 - p0) > thresh) */ "subu_s.qb %[c], %[p4], %[p0] \n\t" "subu_s.qb %[r_k], %[p0], %[p4] \n\t" @@ -355,13 +339,11 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3, /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ "and %[flat1], %[flat3], %[flat1] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), - [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3) - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), - [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1), - [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4), - [flat_thresh] "r" (flat_thresh), [ones] "r" (ones) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), + [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), + [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); *flat2 = flat1; } diff --git a/vpx_dsp/mips/loopfilter_mb_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c index dd0545eed23c71ea10c93f91c29e69aa8ff1f2db..e42479257c322e5330921286cecba20550ef8500 100644 --- a/vpx_dsp/mips/loopfilter_mb_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c @@ -19,36 +19,33 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -void vpx_lpf_horizontal_8_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, +void vpx_lpf_horizontal_8_dspr2(unsigned char *s, 
int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - uint32_t mask; - uint32_t hev, flat; - uint8_t i; - uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p3, p2, p1, p0, q0, q1, q2, q3; - uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; - uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; uflimit = *blimit; - ulimit = *limit; + ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); /* prefetch data for store */ prefetch_store(s); @@ -63,7 +60,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, sq2 = sq1 + pitch; sq3 = sq2 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p3], (%[sp3]) \n\t" "lw %[p2], (%[sp2]) \n\t" "lw %[p1], (%[sp1]) \n\t" @@ -73,46 +70,39 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "lw %[q2], (%[sq2]) \n\t" "lw %[q3], (%[sq3]) \n\t" - : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), - [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0) - : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) - ); + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, - p1, p0, p3, p2, q0, q1, q2, q3, - &hev, &mask, &flat); + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); if ((flat == 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p1_f0], (%[sp1]) \n\t" "sw %[p0_f0], (%[sp0]) \n\t" "sw %[q0_f0], (%[sq0]) \n\t" "sw %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); } else if ((mask & flat) == 0xFFFFFFFF) { /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + 
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); COMBINE_LEFT_RIGHT_0TO2() - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p2], (%[sp2]) \n\t" "sw %[p1], (%[sp1]) \n\t" "sw %[p0], (%[sp0]) \n\t" @@ -121,28 +111,23 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sw %[q2], (%[sq2]) \n\t" : - : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if ((flat != 0) && (mask != 0)) { /* filtering */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], (%[sp2]) \n\t" "sb %[p1_r], (%[sp1]) \n\t" "sb %[p0_r], (%[sp0]) \n\t" @@ -151,27 +136,24 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sb %[q2_r], (%[sq2]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], (%[sp1]) \n\t" "sb %[p0_f0], (%[sp0]) \n\t" "sb %[q0_f0], (%[sq0]) \n\t" "sb %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t" @@ -183,15 +165,14 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), - [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ 
__volatile__( "sb %[p2_r], +1(%[sp2]) \n\t" "sb %[p1_r], +1(%[sp1]) \n\t" "sb %[p0_r], +1(%[sp0]) \n\t" @@ -200,41 +181,36 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sb %[q2_r], +1(%[sq2]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +1(%[sp1]) \n\t" "sb %[p0_f0], +1(%[sp0]) \n\t" "sb %[q0_f0], +1(%[sq0]) \n\t" "sb %[q1_f0], +1(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), - [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], +2(%[sp2]) \n\t" "sb %[p1_l], +2(%[sp1]) \n\t" "sb %[p0_l], +2(%[sp0]) \n\t" @@ -243,27 +219,24 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sb %[q2_l], +2(%[sq2]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +2(%[sp1]) \n\t" "sb %[p0_f0], +2(%[sp0]) \n\t" "sb %[q0_f0], +2(%[sq0]) \n\t" "sb %[q1_f0], +2(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t" @@ -275,15 +248,14 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), - [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), 
[p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], +3(%[sp2]) \n\t" "sb %[p1_l], +3(%[sp1]) \n\t" "sb %[p0_l], +3(%[sp0]) \n\t" @@ -292,24 +264,21 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sb %[q2_l], +3(%[sq2]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +3(%[sp1]) \n\t" "sb %[p0_f0], +3(%[sp0]) \n\t" "sb %[q0_f0], +3(%[sq0]) \n\t" "sb %[q1_f0], +3(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } } @@ -317,36 +286,33 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, } } -void vpx_lpf_vertical_8_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, +void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev, flat; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p3, p2, p1, p0, q3, q2, q1, q0; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; - uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; uflimit = *blimit; - ulimit = *limit; + ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); prefetch_store(s + pitch); @@ -355,9 +321,9 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, s2 = s + pitch; s3 = s2 + pitch; s4 = s3 + pitch; - s = s4 + pitch; + s = s4 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p0], -4(%[s1]) \n\t" "lw %[p1], -4(%[s2]) \n\t" "lw %[p2], -4(%[s3]) \n\t" @@ -367,10 +333,9 @@ void vpx_lpf_vertical_8_dspr2(unsigned char 
*s, "lw %[q1], (%[s3]) \n\t" "lw %[q0], (%[s4]) \n\t" - : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), - [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3) - : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); /* transpose p3, p2, p1, p0 original (when loaded from memory) @@ -387,7 +352,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, p2 p3_1 p2_1 p1_1 p0_1 p3 p3_0 p2_0 p1_0 p0_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" @@ -403,12 +368,10 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "append %[p1], %[sec3], 16 \n\t" "append %[p3], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose q0, q1, q2, q3 original (when loaded from memory) @@ -425,7 +388,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, q1 q0_1 q1_1 q2_1 q3_1 q0 q0_0 q1_0 q2_0 q3_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" @@ -441,49 +404,40 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "append %[q2], %[sec3], 16 \n\t" "append %[q0], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, - p1, p0, p3, p2, q0, q1, q2, q3, - &hev, &mask, &flat); + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); if ((flat == 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); STORE_F0() } else if ((mask & flat) == 0xFFFFFFFF) { /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); STORE_F1() } else if ((flat != 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, 
&q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], -3(%[s4]) \n\t" "sb %[p1_r], -2(%[s4]) \n\t" "sb %[p0_r], -1(%[s4]) \n\t" @@ -492,25 +446,22 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "sb %[q2_r], +2(%[s4]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [s4] "r" (s4) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s4]) \n\t" "sb %[p0_f0], -1(%[s4]) \n\t" "sb %[q0_f0], (%[s4]) \n\t" "sb %[q1_f0], +1(%[s4]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s4] "r" (s4) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t" @@ -522,15 +473,14 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), - [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], -3(%[s3]) \n\t" "sb %[p1_r], -2(%[s3]) \n\t" "sb %[p0_r], -1(%[s3]) \n\t" @@ -539,66 +489,58 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "sb %[q2_r], +2(%[s3]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [s3] "r" (s3) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s3]) \n\t" "sb %[p0_f0], -1(%[s3]) \n\t" "sb %[q0_f0], (%[s3]) \n\t" "sb %[q1_f0], +1(%[s3]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s3] "r" (s3) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), - [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], 
-1(%[s2]) \n\t" - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" - : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [s2] "r" (s2) - ); + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s2]) \n\t" "sb %[p0_f0], -1(%[s2]) \n\t" "sb %[q0_f0], (%[s2]) \n\t" "sb %[q1_f0], +1(%[s2]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s2] "r" (s2) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t" @@ -610,15 +552,14 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), - [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], -3(%[s1]) \n\t" "sb %[p1_l], -2(%[s1]) \n\t" "sb %[p0_l], -1(%[s1]) \n\t" @@ -627,21 +568,19 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "sb %[q2_l], +2(%[s1]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [s1] "r" (s1) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s1]) \n\t" "sb %[p0_f0], -1(%[s1]) \n\t" "sb %[q0_f0], (%[s1]) \n\t" "sb %[q1_f0], +1(%[s1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), - [q1_f0] "r" (q1_f0), [s1] "r" (s1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); } } } diff --git a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c index 85e167ca054fd1a374a0f5ef27be64903ba6fa96..6325762a2aa26b117a9e0501680c42a5814e7a78 100644 --- a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -19,42 +19,38 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -static void mb_lpf_horizontal_edge(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { - uint32_t mask; - uint32_t hev, flat, flat2; - uint8_t i; - uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; - uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - uint32_t p1_f0, p0_f0, 
q0_f0, q1_f0; - uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; - uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; - uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; - uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; - uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; - uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; +static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; uflimit = *blimit; - ulimit = *limit; + ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); /* prefetch data for store */ prefetch_store(s); @@ -77,7 +73,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s, sq6 = sq5 + pitch; sq7 = sq6 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p7], (%[sp7]) \n\t" "lw %[p6], (%[sp6]) \n\t" "lw %[p5], (%[sp5]) \n\t" @@ -87,13 +83,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "lw %[p1], (%[sp1]) \n\t" "lw %[p0], (%[sp0]) \n\t" - : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), - [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4) - : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7) - ); + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[q0], (%[sq0]) \n\t" "lw %[q1], (%[sq1]) \n\t" "lw %[q2], (%[sq2]) \n\t" @@ -103,57 +98,50 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "lw %[q6], (%[sq6]) \n\t" "lw %[q7], (%[sq7]) \n\t" - : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0), - [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4) - : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0), - [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7) - ); + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [sq3] "r"(sq3), 
[sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0), + [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, - p1, p0, p3, p2, q0, q1, q2, q3, - &hev, &mask, &flat); + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); /* f0 */ if (((flat2 == 0) && (flat == 0) && (mask != 0)) || ((flat2 != 0) && (flat == 0) && (mask != 0))) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p1_f0], (%[sp1]) \n\t" "sw %[p0_f0], (%[sp0]) \n\t" "sw %[q0_f0], (%[sq0]) \n\t" "sw %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { /* f2 */ PACK_LEFT_0TO3() PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, - &p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l, - &q4_l, &q5_l, &q6_l, &q7_l); + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); PACK_RIGHT_0TO3() PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, - &p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r, - &q4_r, &q5_r, &q6_r, &q7_r); + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); COMBINE_LEFT_RIGHT_0TO2() COMBINE_LEFT_RIGHT_3TO6() - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p6], (%[sp6]) \n\t" "sw %[p5], (%[sp5]) \n\t" "sw %[p4], (%[sp4]) \n\t" @@ -163,13 +151,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sw %[p0], (%[sp0]) \n\t" : - : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), - [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) - ); + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), + [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[q6], (%[sq6]) \n\t" "sw %[q5], (%[sq5]) \n\t" "sw %[q4], (%[sq4]) \n\t" @@ -179,26 +166,23 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sw %[q0], (%[sq0]) \n\t" : - : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3), - [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), - [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3), - [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) - ); + : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), + [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), + [sq1] "r"(sq1), [sq0] "r"(sq0)); } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { /* f1 */ /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + 
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); COMBINE_LEFT_RIGHT_0TO2() - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p2], (%[sp2]) \n\t" "sw %[p1], (%[sp1]) \n\t" "sw %[p0], (%[sp0]) \n\t" @@ -207,28 +191,23 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sw %[q2], (%[sq2]) \n\t" : - : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { /* f0+f1 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], (%[sp2]) \n\t" "sb %[p1_r], (%[sp1]) \n\t" "sb %[p0_r], (%[sp0]) \n\t" @@ -237,27 +216,24 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_r], (%[sq2]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], (%[sp1]) \n\t" "sb %[p0_f0], (%[sp0]) \n\t" "sb %[q0_f0], (%[sq0]) \n\t" "sb %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t" @@ -269,15 +245,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), - [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x0000FF00) { - __asm__ 
__volatile__ ( + __asm__ __volatile__( "sb %[p2_r], +1(%[sp2]) \n\t" "sb %[p1_r], +1(%[sp1]) \n\t" "sb %[p0_r], +1(%[sp0]) \n\t" @@ -286,39 +261,35 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_r], +1(%[sq2]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +1(%[sp1]) \n\t" "sb %[p0_f0], +1(%[sp0]) \n\t" "sb %[q0_f0], +1(%[sq0]) \n\t" "sb %[q1_f0], +1(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], +2(%[sp2]) \n\t" "sb %[p1_l], +2(%[sp1]) \n\t" "sb %[p0_l], +2(%[sp0]) \n\t" @@ -327,27 +298,24 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_l], +2(%[sq2]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +2(%[sp1]) \n\t" "sb %[p0_f0], +2(%[sp0]) \n\t" "sb %[q0_f0], +2(%[sq0]) \n\t" "sb %[q1_f0], +2(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t" @@ -359,15 +327,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), - [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] 
"+r"(q1_f0) + :); if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], +3(%[sp2]) \n\t" "sb %[p1_l], +3(%[sp1]) \n\t" "sb %[p0_l], +3(%[sp0]) \n\t" @@ -376,61 +343,51 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_l], +3(%[sq2]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +3(%[sp1]) \n\t" "sb %[p0_f0], +3(%[sp0]) \n\t" "sb %[q0_f0], +3(%[sq0]) \n\t" "sb %[q1_f0], +3(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { /* f0 + f1 + f2 */ /* f0 function */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* f1 function */ /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, - q0_l, q1_l, q2_l, q3_l, - &p2_l_f1, &p1_l_f1, &p0_l_f1, - &q0_l_f1, &q1_l_f1, &q2_l_f1); + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, - q0_r, q1_r, q2_r, q3_r, - &p2_r_f1, &p1_r_f1, &p0_r_f1, - &q0_r_f1, &q1_r_f1, &q2_r_f1); + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); /* f2 function */ PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, - &p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l, - &q4_l, &q5_l, &q6_l, &q7_l); + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, - &p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r, - &q4_r, &q5_r, &q6_r, &q7_r); + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); if (mask & flat & flat2 & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_r], (%[sp6]) \n\t" "sb %[p5_r], (%[sp5]) \n\t" "sb %[p4_r], (%[sp4]) \n\t" @@ -440,14 +397,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[p0_r], (%[sp0]) \n\t" : - : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), - [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), - [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), - [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), - [p0_r] "r" (p0_r), [sp0] "r" (sp0) - ); - - __asm__ __volatile__ ( + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), + [sp2] "r"(sp2), [sp1] 
"r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); + + __asm__ __volatile__( "sb %[q0_r], (%[sq0]) \n\t" "sb %[q1_r], (%[sq1]) \n\t" "sb %[q2_r], (%[sq2]) \n\t" @@ -457,15 +412,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q6_r], (%[sq6]) \n\t" : - : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), - [q6_r] "r" (q6_r), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), - [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5), - [sq6] "r" (sq6) - ); + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); } else if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r_f1], (%[sp2]) \n\t" "sb %[p1_r_f1], (%[sp1]) \n\t" "sb %[p0_r_f1], (%[sp0]) \n\t" @@ -474,27 +426,25 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_r_f1], (%[sq2]) \n\t" : - : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), - [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), - [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], (%[sp1]) \n\t" "sb %[p0_f0], (%[sp0]) \n\t" "sb %[q0_f0], (%[sq0]) \n\t" "sb %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), - [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p6_r], %[p6_r], 16 \n\t" "srl %[p5_r], %[p5_r], 16 \n\t" "srl %[p4_r], %[p4_r], 16 \n\t" @@ -510,15 +460,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q5_r], %[q5_r], 16 \n\t" "srl %[q6_r], %[q6_r], 16 \n\t" - : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), - [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), - [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), - [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r) - : - ); + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), + [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), + [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" @@ -530,16 +479,15 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), - [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), - [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : 
[p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_r], +1(%[sp6]) \n\t" "sb %[p5_r], +1(%[sp5]) \n\t" "sb %[p4_r], +1(%[sp4]) \n\t" @@ -549,14 +497,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[p0_r], +1(%[sp0]) \n\t" : - : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), - [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), - [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5), - [sp4] "r" (sp4), [sp3] "r" (sp3), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) - ); - - __asm__ __volatile__ ( + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( "sb %[q0_r], +1(%[sq0]) \n\t" "sb %[q1_r], +1(%[sq1]) \n\t" "sb %[q2_r], +1(%[sq2]) \n\t" @@ -566,14 +512,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q6_r], +1(%[sq6]) \n\t" : - : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), - [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1), - [sq2] "r" (sq2), [sq3] "r" (sq3), - [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6) - ); + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); } else if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r_f1], +1(%[sp2]) \n\t" "sb %[p1_r_f1], +1(%[sp1]) \n\t" "sb %[p0_r_f1], +1(%[sp0]) \n\t" @@ -582,39 +526,36 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_r_f1], +1(%[sq2]) \n\t" : - : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), - [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), - [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +1(%[sp1]) \n\t" "sb %[p0_f0], +1(%[sp0]) \n\t" "sb %[q0_f0], +1(%[sq0]) \n\t" "sb %[q1_f0], +1(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), - [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat 
& flat2 & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_l], +2(%[sp6]) \n\t" "sb %[p5_l], +2(%[sp5]) \n\t" "sb %[p4_l], +2(%[sp4]) \n\t" @@ -624,14 +565,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[p0_l], +2(%[sp0]) \n\t" : - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), - [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), - [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), - [sp4] "r" (sp4), [sp3] "r" (sp3), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) - ); - - __asm__ __volatile__ ( + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( "sb %[q0_l], +2(%[sq0]) \n\t" "sb %[q1_l], +2(%[sq1]) \n\t" "sb %[q2_l], +2(%[sq2]) \n\t" @@ -641,14 +580,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q6_l], +2(%[sq6]) \n\t" : - : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), - [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1), - [sq2] "r" (sq2), [sq3] "r" (sq3), - [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6) - ); + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); } else if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l_f1], +2(%[sp2]) \n\t" "sb %[p1_l_f1], +2(%[sp1]) \n\t" "sb %[p0_l_f1], +2(%[sp0]) \n\t" @@ -657,27 +594,25 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_l_f1], +2(%[sq2]) \n\t" : - : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), - [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), - [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +2(%[sp1]) \n\t" "sb %[p0_f0], +2(%[sp0]) \n\t" "sb %[q0_f0], +2(%[sq0]) \n\t" "sb %[q1_f0], +2(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), - [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p6_l], %[p6_l], 16 \n\t" "srl %[p5_l], %[p5_l], 16 \n\t" "srl %[p4_l], %[p4_l], 16 \n\t" @@ -693,15 +628,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q5_l], %[q5_l], 16 \n\t" "srl %[q6_l], %[q6_l], 16 \n\t" - : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), - [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), - [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), - [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) - : - ); + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] 
"+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" @@ -713,16 +647,15 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), - [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), - [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_l], +3(%[sp6]) \n\t" "sb %[p5_l], +3(%[sp5]) \n\t" "sb %[p4_l], +3(%[sp4]) \n\t" @@ -732,14 +665,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[p0_l], +3(%[sp0]) \n\t" : - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), - [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), - [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), - [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2), - [sp1] "r" (sp1), [sp0] "r" (sp0) - ); - - __asm__ __volatile__ ( + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( "sb %[q0_l], +3(%[sq0]) \n\t" "sb %[q1_l], +3(%[sq1]) \n\t" "sb %[q2_l], +3(%[sq2]) \n\t" @@ -749,15 +680,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q6_l], +3(%[sq6]) \n\t" : - : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), - [q2_l] "r" (q2_l), [q3_l] "r" (q3_l), - [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), - [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5), - [q6_l] "r" (q6_l), [sq6] "r" (sq6) - ); + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), + [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6)); } else if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l_f1], +3(%[sp2]) \n\t" "sb %[p1_l_f1], +3(%[sp1]) \n\t" "sb %[p0_l_f1], +3(%[sp0]) \n\t" @@ -766,25 +694,22 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_l_f1], +3(%[sq2]) \n\t" : - : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), - [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), - [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +3(%[sp1]) \n\t" "sb %[p0_f0], 
+3(%[sp0]) \n\t" "sb %[q0_f0], +3(%[sq0]) \n\t" "sb %[q1_f0], +3(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } } diff --git a/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c index e580f014e933aa845f51eabf78cebb11a348c860..96e8d8858a117bdc4c6f3b5ddac48e3cf0759ff9 100644 --- a/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c @@ -19,40 +19,36 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -void vpx_lpf_vertical_16_dspr2(uint8_t *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev, flat, flat2; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; - uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; - uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; - uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; - uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; - uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; +void vpx_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; uflimit = *blimit; ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); prefetch_store(s + pitch); @@ -61,9 +57,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, s2 = s + pitch; s3 = s2 + pitch; s4 = s3 + pitch; - s = s4 + pitch; + s = s4 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p0], -4(%[s1]) \n\t" "lw %[p1], -4(%[s2]) \n\t" "lw %[p2], -4(%[s3]) \n\t" @@ -73,13 +69,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "lw %[p6], -8(%[s3]) \n\t" "lw %[p7], -8(%[s4]) \n\t" - : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), - [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6), - [p5] "=&r" (p5), [p4] 
"=&r" (p4) - : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[q3], (%[s1]) \n\t" "lw %[q2], (%[s2]) \n\t" "lw %[q1], (%[s3]) \n\t" @@ -89,11 +83,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "lw %[q5], +4(%[s3]) \n\t" "lw %[q4], +4(%[s4]) \n\t" - : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), - [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6), - [q5] "=&r" (q5), [q4] "=&r" (q4) - : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); /* transpose p3, p2, p1, p0 original (when loaded from memory) @@ -110,7 +102,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, p2 p3_1 p2_1 p1_1 p0_1 p3 p3_0 p2_0 p1_0 p0_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" @@ -126,12 +118,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "append %[p1], %[sec3], 16 \n\t" "append %[p3], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose q0, q1, q2, q3 original (when loaded from memory) @@ -148,7 +138,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, q1 q0_1 q1_1 q2_1 q3_1 q0 q0_0 q1_0 q2_0 q3_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" @@ -164,12 +154,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "append %[q2], %[sec3], 16 \n\t" "append %[q0], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose p7, p6, p5, p4 original (when loaded from memory) @@ -186,7 +174,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, p6 p7_1 p6_1 p5_1 p4_1 p7 p7_0 p6_0 p5_0 p4_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" @@ -202,12 +190,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "append %[p5], %[sec3], 16 \n\t" "append %[p7], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] 
"+r"(p6), + [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose q4, q5, q6, q7 original (when loaded from memory) @@ -224,7 +210,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, q5 q4_1 q5_1 q26_1 q7_1 q4 q4_0 q5_0 q26_0 q7_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" @@ -240,71 +226,60 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "append %[q6], %[sec3], 16 \n\t" "append %[q4], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), + [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, - p1, p0, p3, p2, q0, q1, q2, q3, - &hev, &mask, &flat); + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); /* f0 */ if (((flat2 == 0) && (flat == 0) && (mask != 0)) || ((flat2 != 0) && (flat == 0) && (mask != 0))) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); STORE_F0() } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { /* f2 */ PACK_LEFT_0TO3() PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, - &p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l, - &q4_l, &q5_l, &q6_l, &q7_l); + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); PACK_RIGHT_0TO3() PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, - &p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r, - &q4_r, &q5_r, &q6_r, &q7_r); + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); STORE_F2() } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { /* f1 */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); STORE_F1() } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { /* f0 + f1 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], -3(%[s4]) \n\t" "sb %[p1_r], -2(%[s4]) \n\t" "sb %[p0_r], -1(%[s4]) \n\t" @@ -313,25 +288,22 @@ void 
vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_r], +2(%[s4]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [s4] "r" (s4) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s4]) \n\t" "sb %[p0_f0], -1(%[s4]) \n\t" "sb %[q0_f0], (%[s4]) \n\t" "sb %[q1_f0], +1(%[s4]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s4] "r" (s4) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t" @@ -343,15 +315,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), - [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], -3(%[s3]) \n\t" "sb %[p1_r], -2(%[s3]) \n\t" "sb %[p0_r], -1(%[s3]) \n\t" @@ -360,64 +331,57 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_r], +2(%[s3]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [s3] "r" (s3) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s3]) \n\t" "sb %[p0_f0], -1(%[s3]) \n\t" "sb %[q0_f0], (%[s3]) \n\t" "sb %[q1_f0], +1(%[s3]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s3] "r" (s3) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], -1(%[s2]) \n\t" - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" - - : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [s2] "r" (s2) - ); + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); } else if (mask & 
0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s2]) \n\t" "sb %[p0_f0], -1(%[s2]) \n\t" "sb %[q0_f0], (%[s2]) \n\t" "sb %[q1_f0], +1(%[s2]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s2] "r" (s2) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t" @@ -429,15 +393,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), - [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], -3(%[s1]) \n\t" "sb %[p1_l], -2(%[s1]) \n\t" "sb %[p0_l], -1(%[s1]) \n\t" @@ -446,54 +409,44 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_l], +2(%[s1]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [s1] "r" (s1) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s1]) \n\t" "sb %[p0_f0], -1(%[s1]) \n\t" "sb %[q0_f0], (%[s1]) \n\t" "sb %[q1_f0], +1(%[s1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s1] "r" (s1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); } } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { /* f0+f1+f2 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); PACK_LEFT_0TO3() - mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, - q0_l, q1_l, q2_l, q3_l, - &p2_l_f1, &p1_l_f1, &p0_l_f1, - &q0_l_f1, &q1_l_f1, &q2_l_f1); + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); PACK_RIGHT_0TO3() - mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, - q0_r, q1_r, q2_r, q3_r, - &p2_r_f1, &p1_r_f1, &p0_r_f1, - &q0_r_f1, &q1_r_f1, &q2_r_f1); + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, - &p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l, - &q4_l, &q5_l, &q6_l, &q7_l); + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, - &p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r, - &q4_r, &q5_r, &q6_r, &q7_r); + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); if (mask & flat & flat2 & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_r], -7(%[s4]) 
\n\t" "sb %[p5_r], -6(%[s4]) \n\t" "sb %[p4_r], -5(%[s4]) \n\t" @@ -503,13 +456,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[p0_r], -1(%[s4]) \n\t" : - : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), - [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), - [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), - [p0_r] "r" (p0_r), [s4] "r" (s4) - ); + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s4] "r"(s4)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[q0_r], (%[s4]) \n\t" "sb %[q1_r], +1(%[s4]) \n\t" "sb %[q2_r], +2(%[s4]) \n\t" @@ -519,13 +470,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q6_r], +6(%[s4]) \n\t" : - : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), - [q2_r] "r" (q2_r), [q3_r] "r" (q3_r), - [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), - [q6_r] "r" (q6_r), [s4] "r" (s4) - ); + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s4] "r"(s4)); } else if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r_f1], -3(%[s4]) \n\t" "sb %[p1_r_f1], -2(%[s4]) \n\t" "sb %[p0_r_f1], -1(%[s4]) \n\t" @@ -534,26 +483,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_r_f1], +2(%[s4]) \n\t" : - : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), - [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), - [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), - [s4] "r" (s4) - ); + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s4]) \n\t" "sb %[p0_f0], -1(%[s4]) \n\t" "sb %[q0_f0], (%[s4]) \n\t" "sb %[q1_f0], +1(%[s4]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s4] "r" (s4) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p6_r], %[p6_r], 16 \n\t" "srl %[p5_r], %[p5_r], 16 \n\t" "srl %[p4_r], %[p4_r], 16 \n\t" @@ -569,17 +514,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q5_r], %[q5_r], 16 \n\t" "srl %[q6_r], %[q6_r], 16 \n\t" - : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), - [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r), - [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), - [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r), - [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), - [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), - [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r) - : - ); - - __asm__ __volatile__ ( + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), + [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), + [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" @@ -591,16 +533,15 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), - [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), - [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] 
"+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_r], -7(%[s3]) \n\t" "sb %[p5_r], -6(%[s3]) \n\t" "sb %[p4_r], -5(%[s3]) \n\t" @@ -610,12 +551,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[p0_r], -1(%[s3]) \n\t" : - : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), - [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), - [p0_r] "r" (p0_r), [s3] "r" (s3) - ); + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s3] "r"(s3)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[q0_r], (%[s3]) \n\t" "sb %[q1_r], +1(%[s3]) \n\t" "sb %[q2_r], +2(%[s3]) \n\t" @@ -625,13 +565,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q6_r], +6(%[s3]) \n\t" : - : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), - [q2_r] "r" (q2_r), [q3_r] "r" (q3_r), - [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), - [q6_r] "r" (q6_r), [s3] "r" (s3) - ); + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s3] "r"(s3)); } else if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r_f1], -3(%[s3]) \n\t" "sb %[p1_r_f1], -2(%[s3]) \n\t" "sb %[p0_r_f1], -1(%[s3]) \n\t" @@ -640,38 +578,33 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_r_f1], +2(%[s3]) \n\t" : - : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), - [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), - [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), - [s3] "r" (s3) - ); + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s3]) \n\t" "sb %[p0_f0], -1(%[s3]) \n\t" "sb %[q0_f0], (%[s3]) \n\t" "sb %[q1_f0], +1(%[s3]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s3] "r" (s3) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_l], -7(%[s2]) \n\t" "sb %[p5_l], -6(%[s2]) \n\t" "sb %[p4_l], -5(%[s2]) \n\t" @@ -681,12 +614,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[p0_l], -1(%[s2]) \n\t" : - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), - [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), - [p0_l] "r" (p0_l), [s2] "r" (s2) - ); + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s2] "r"(s2)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[q0_l], (%[s2]) \n\t" "sb %[q1_l], +1(%[s2]) \n\t" "sb %[q2_l], +2(%[s2]) \n\t" @@ -696,12 +628,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q6_l], +6(%[s2]) \n\t" : - : [q0_l] "r" 
(q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), - [q6_l] "r" (q6_l), [s2] "r" (s2) - ); + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s2] "r"(s2)); } else if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l_f1], -3(%[s2]) \n\t" "sb %[p1_l_f1], -2(%[s2]) \n\t" "sb %[p0_l_f1], -1(%[s2]) \n\t" @@ -710,26 +641,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_l_f1], +2(%[s2]) \n\t" : - : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), - [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), - [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), - [s2] "r" (s2) - ); + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s2]) \n\t" "sb %[p0_f0], -1(%[s2]) \n\t" "sb %[q0_f0], (%[s2]) \n\t" "sb %[q1_f0], +1(%[s2]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s2] "r" (s2) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p6_l], %[p6_l], 16 \n\t" "srl %[p5_l], %[p5_l], 16 \n\t" "srl %[p4_l], %[p4_l], 16 \n\t" @@ -745,15 +672,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q5_l], %[q5_l], 16 \n\t" "srl %[q6_l], %[q6_l], 16 \n\t" - : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), - [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), - [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), - [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) - : - ); + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" @@ -765,16 +691,15 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), - [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), - [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_l], -7(%[s1]) \n\t" "sb %[p5_l], -6(%[s1]) \n\t" "sb %[p4_l], -5(%[s1]) \n\t" @@ -784,13 +709,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[p0_l], -1(%[s1]) \n\t" : - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), - [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), - [p0_l] "r" (p0_l), - [s1] "r" (s1) - ); + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + 
[p0_l] "r"(p0_l), [s1] "r"(s1)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[q0_l], (%[s1]) \n\t" "sb %[q1_l], 1(%[s1]) \n\t" "sb %[q2_l], 2(%[s1]) \n\t" @@ -800,13 +723,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q6_l], 6(%[s1]) \n\t" : - : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), - [q6_l] "r" (q6_l), - [s1] "r" (s1) - ); + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s1] "r"(s1)); } else if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l_f1], -3(%[s1]) \n\t" "sb %[p1_l_f1], -2(%[s1]) \n\t" "sb %[p0_l_f1], -1(%[s1]) \n\t" @@ -815,23 +736,19 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_l_f1], +2(%[s1]) \n\t" : - : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), - [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), - [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), - [s1] "r" (s1) - ); + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s1]) \n\t" "sb %[p0_f0], -1(%[s1]) \n\t" "sb %[q0_f0], (%[s1]) \n\t" "sb %[q1_f0], +1(%[s1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s1] "r" (s1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); } } } diff --git a/vpx_dsp/mips/loopfilter_msa.h b/vpx_dsp/mips/loopfilter_msa.h index 9894701bf7b8fa18aa82beaaff8966f78772e508..d3c2bd4edb1230bc33f1874192233ef197006967 100644 --- a/vpx_dsp/mips/loopfilter_msa.h +++ b/vpx_dsp/mips/loopfilter_msa.h @@ -13,234 +13,238 @@ #include "vpx_dsp/mips/macros_msa.h" -#define VPX_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - filt = filt & (v16i8)hev_in; \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - /* combine left and right part */ \ - filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ - \ - filt = filt & (v16i8)mask_in; \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = 
__msa_xori_b((v16u8)p1_m, 0x80); \ -} +#define VPX_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt = filt & (v16i8)hev_in; \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ + \ + filt = filt & (v16i8)mask_in; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } -#define VPX_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = 
__msa_xori_b((v16u8)p1_m, 0x80); \ -} +#define VPX_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8)hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ + filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ + filt = filt & (v16i8)mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } -#define VPX_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) { \ - v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ - v16u8 zero_in = { 0 }; \ - \ - tmp = __msa_ori_b(zero_in, 1); \ - p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ - q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ - p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ - q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ - \ - p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ - flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ - p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ - flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ - \ - flat_out = (tmp < (v16u8)flat_out); \ - flat_out = __msa_xori_b(flat_out, 0xff); \ - flat_out = flat_out & (mask); \ -} +#define VPX_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } -#define VPX_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \ - q5_in, q6_in, q7_in, flat_in, 
flat2_out) { \ - v16u8 tmp, zero_in = { 0 }; \ - v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ - v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ - \ - tmp = __msa_ori_b(zero_in, 1); \ - p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ - q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ - p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ - q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ - p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ - q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ - p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ - q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ - \ - p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ - flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ - flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ - p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ - flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ - p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ - flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ - \ - flat2_out = (tmp < (v16u8)flat2_out); \ - flat2_out = __msa_xori_b(flat2_out, 0xff); \ - flat2_out = flat2_out & flat_in; \ -} +#define VPX_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + v16u8 tmp, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp < (v16u8)flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } -#define VPX_FILTER8(p3_in, p2_in, p1_in, p0_in, \ - q0_in, q1_in, q2_in, q3_in, \ - p2_filt8_out, p1_filt8_out, p0_filt8_out, \ - q0_filt8_out, q1_filt8_out, q2_filt8_out) { \ - v8u16 tmp0, tmp1, tmp2; \ - \ - tmp2 = p2_in + p1_in + p0_in; \ - tmp0 = p3_in << 1; \ - \ - tmp0 = tmp0 + tmp2 + q0_in; \ - tmp1 = tmp0 + p3_in + p2_in; \ - p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ - \ - tmp1 = tmp0 + p1_in + q1_in; \ - p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ - \ - tmp1 = q2_in + q1_in + q0_in; \ - tmp2 = tmp2 + tmp1; \ - tmp0 = tmp2 + (p0_in); \ - tmp0 = tmp0 + (p3_in); \ - p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ - \ - tmp0 = q2_in + q3_in; \ - tmp0 = p0_in + tmp1 + tmp0; \ - tmp1 = q3_in + q3_in; \ - tmp1 = tmp1 + tmp0; \ - q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ - \ - tmp0 = tmp2 + q3_in; \ - tmp1 = tmp0 + q0_in; \ - q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ - \ - tmp1 = tmp0 - p2_in; \ - tmp0 = q1_in + q3_in; \ - tmp1 = tmp0 + tmp1; \ - q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ -} +#define VPX_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, 
p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + v8u16 tmp0, tmp1, tmp2; \ + \ + tmp2 = p2_in + p1_in + p0_in; \ + tmp0 = p3_in << 1; \ + \ + tmp0 = tmp0 + tmp2 + q0_in; \ + tmp1 = tmp0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp1 = tmp0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp1 = q2_in + q1_in + q0_in; \ + tmp2 = tmp2 + tmp1; \ + tmp0 = tmp2 + (p0_in); \ + tmp0 = tmp0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ + \ + tmp0 = q2_in + q3_in; \ + tmp0 = p0_in + tmp1 + tmp0; \ + tmp1 = q3_in + q3_in; \ + tmp1 = tmp1 + tmp0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp0 = tmp2 + q3_in; \ + tmp1 = tmp0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp1 = tmp0 - p2_in; \ + tmp0 = q1_in + q3_in; \ + tmp1 = tmp0 + tmp1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + } -#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \ - q0_in, q1_in, q2_in, q3_in, \ - limit_in, b_limit_in, thresh_in, \ - hev_out, mask_out, flat_out) { \ - v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ - v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ - \ - /* absolute subtraction of pixel values */ \ - p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ - p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ - p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ - q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ - q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ - q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ - p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ - p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ - \ - /* calculation of hev */ \ - flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ - hev_out = thresh_in < (v16u8)flat_out; \ - \ - /* calculation of mask */ \ - p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ - p1_asub_q1_m >>= 1; \ - p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ - \ - mask_out = b_limit_in < p0_asub_q0_m; \ - mask_out = __msa_max_u_b(flat_out, mask_out); \ - p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ - mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ - q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ - mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ - \ - mask_out = limit_in < (v16u8)mask_out; \ - mask_out = __msa_xori_b(mask_out, 0xff); \ -} -#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */ +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8)flat_out; \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = 
b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ + } +#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */ diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index ea59eafe92abb31a45cda5f7a74702b67cff6956..f498fbe9de248be4a7126f808d0ce51fcfcff8ed 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -38,194 +38,186 @@ #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LH(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__ ( \ - "lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) - -#define LW(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__ ( \ - "lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) #if (__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__ ( \ - "ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) #else // !(__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } +#else // !(__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const 
uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ + __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ \ - val_m; \ -}) -#endif // (__mips == 64) + val_m; \ + }) -#define SH(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SW(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SD(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} -#else // !(__mips_isa_rev >= 6) -#define LH(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__ ( \ - "ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) - -#define LW(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__ ( \ - "ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) #if (__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__ ( \ - "uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) -#else // !(__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ + __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ \ - val_m; \ -}) -#endif // (__mips == 64) - -#define SH(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SW(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SD(val, pdst) { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t 
val_m = 0; \ \ - val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ -} + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ + } #endif // (__mips_isa_rev >= 6) /* Description : Load 4 words with stride @@ -236,12 +228,13 @@ Load word in 'out2' from (psrc + 2 * stride) Load word in 'out3' from (psrc + 3 * stride) */ -#define LW4(psrc, stride, out0, out1, out2, out3) { \ - out0 = LW((psrc)); \ - out1 = LW((psrc) + stride); \ - out2 = LW((psrc) + 2 * stride); \ - out3 = LW((psrc) + 3 * stride); \ -} +#define LW4(psrc, stride, out0, out1, out2, out3) \ + { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ + } /* Description : Load double words with stride Arguments : Inputs - psrc, stride @@ -249,14 +242,16 @@ Details : Load double word in 'out0' from (psrc) Load double word in 'out1' from (psrc + stride) */ -#define LD2(psrc, stride, out0, out1) { \ - out0 = LD((psrc)); \ - out1 = LD((psrc) + stride); \ -} -#define LD4(psrc, stride, out0, out1, out2, out3) { \ - LD2((psrc), stride, out0, out1); \ - LD2((psrc) + 2 * stride, stride, out2, out3); \ -} +#define LD2(psrc, stride, out0, out1) \ + { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ + } +#define LD4(psrc, stride, out0, out1, out2, out3) \ + { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ + } /* Description : Store 4 words with stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride @@ -265,12 +260,13 @@ Store word from 'in2' to (pdst + 2 * stride) Store word from 'in3' to (pdst + 3 * stride) */ -#define SW4(in0, in1, in2, in3, pdst, stride) { \ - SW(in0, (pdst)) \ - SW(in1, (pdst) + stride); \ - SW(in2, (pdst) + 2 * stride); \ - SW(in3, (pdst) + 3 * stride); \ -} +#define SW4(in0, in1, in2, in3, pdst, stride) \ + { \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ + } /* Description : Store 4 double words with stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride @@ -279,12 +275,13 @@ Store double word from 'in2' to (pdst + 2 * stride) Store double word from 'in3' to (pdst + 3 * stride) */ -#define SD4(in0, in1, in2, in3, pdst, stride) { \ - SD(in0, (pdst)) \ - SD(in1, (pdst) + stride); \ - SD(in2, (pdst) + 2 * stride); \ - SD(in3, (pdst) + 3 * stride); \ -} +#define SD4(in0, in1, in2, in3, pdst, stride) \ + { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + 
SD(in3, (pdst) + 3 * stride); \ + } /* Description : Load vectors with 16 byte elements with stride Arguments : Inputs - psrc, stride @@ -293,45 +290,50 @@ Details : Load 16 byte elements in 'out0' from (psrc) Load 16 byte elements in 'out1' from (psrc + stride) */ -#define LD_B2(RTYPE, psrc, stride, out0, out1) { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ -} +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) -#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ -} +#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + } #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ -} +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ -} +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + } #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) -#define LD_B7(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6) { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ -} +#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ + { \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + } #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) -#define LD_B8(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ -} +#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) @@ -341,33 +343,36 @@ Details : Load 8 halfword elements in 'out0' from (psrc) Load 8 halfword elements in 'out1' from (psrc + stride) */ -#define LD_H2(RTYPE, psrc, stride, out0, out1) { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ -} +#define LD_H2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_H(RTYPE, (psrc)); \ + out1 = LD_H(RTYPE, (psrc) + (stride)); \ + } #define LD_SH2(...) 
LD_H2(v8i16, __VA_ARGS__) -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ -} +#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_H2(RTYPE, (psrc), stride, out0, out1); \ + LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) -#define LD_H8(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ -} +#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) -#define LD_H16(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, out15) { \ - LD_H8(RTYPE, (psrc), stride, \ - out0, out1, out2, out3, out4, out5, out6, out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ - out8, out9, out10, out11, out12, out13, out14, out15); \ -} +#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + out7); \ + LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + out13, out14, out15); \ + } #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) /* Description : Load 4x4 block of signed halfword elements from 1D source @@ -375,45 +380,49 @@ Arguments : Input - psrc Outputs - out0, out1, out2, out3 */ -#define LD4x4_SH(psrc, out0, out1, out2, out3) { \ - out0 = LD_SH(psrc); \ - out2 = LD_SH(psrc + 8); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ -} +#define LD4x4_SH(psrc, out0, out1, out2, out3) \ + { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + } /* Description : Load 2 vectors of signed word elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - signed word */ -#define LD_SW2(psrc, stride, out0, out1) { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ -} +#define LD_SW2(psrc, stride, out0, out1) \ + { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ + } /* Description : Store vectors of 16 byte elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 16 byte elements from 'in0' to (pdst) Store 16 byte elements from 'in1' to (pdst + stride) */ -#define ST_B2(RTYPE, in0, in1, pdst, stride) { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ -} +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ -} +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } #define ST_UB4(...) 
ST_B4(v16u8, __VA_ARGS__) -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - pdst, stride) { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ -} +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) /* Description : Store vectors of 8 halfword elements with stride @@ -421,22 +430,25 @@ Details : Store 8 halfword elements from 'in0' to (pdst) Store 8 halfword elements from 'in1' to (pdst + stride) */ -#define ST_H2(RTYPE, in0, in1, pdst, stride) { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ -} +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ -} +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ -} +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } #define ST_SH8(...) 
ST_H8(v8i16, __VA_ARGS__) /* Description : Store vectors of word elements with stride @@ -444,10 +456,11 @@ Details : Store 4 word elements from 'in0' to (pdst) Store 4 word elements from 'in1' to (pdst + stride) */ -#define ST_SW2(in0, in1, pdst, stride) { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ -} +#define ST_SW2(in0, in1, pdst, stride) \ + { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ + } /* Description : Store 2x4 byte block to destination memory from input vector Arguments : Inputs - in, stidx, pdst, stride @@ -460,20 +473,21 @@ Index 'stidx+3' halfword element from 'in' vector is copied to the GP register and stored to (pdst + 3 * stride) */ -#define ST2x4_UB(in, stidx, pdst, stride) { \ - uint16_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ - out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ - out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ - out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ - \ - SH(out0_m, pblk_2x4_m); \ - SH(out1_m, pblk_2x4_m + stride); \ - SH(out2_m, pblk_2x4_m + 2 * stride); \ - SH(out3_m, pblk_2x4_m + 3 * stride); \ -} +#define ST2x4_UB(in, stidx, pdst, stride) \ + { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ + } /* Description : Store 4x2 byte block to destination memory from input vector Arguments : Inputs - in, pdst, stride @@ -482,16 +496,17 @@ Index 1 word element from 'in' vector is copied to the GP register and stored to (pdst + stride) */ -#define ST4x2_UB(in, pdst, stride) { \ - uint32_t out0_m, out1_m; \ - uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in, 0); \ - out1_m = __msa_copy_u_w((v4i32)in, 1); \ - \ - SW(out0_m, pblk_4x2_m); \ - SW(out1_m, pblk_4x2_m + stride); \ -} +#define ST4x2_UB(in, pdst, stride) \ + { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ + } /* Description : Store 4x4 byte block to destination memory from input vector Arguments : Inputs - in0, in1, pdst, stride @@ -504,35 +519,38 @@ 'Idx3' word element from input vector 'in0' is copied to the GP register and stored to (pdst + 3 * stride) */ -#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ - uint32_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ - out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ - out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ - out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ - \ - SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ -} -#define ST4x8_UB(in0, in1, pdst, stride) { \ - uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ - \ - ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ - ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ -} +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ + { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ 
+ out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ + } +#define ST4x8_UB(in0, in1, pdst, stride) \ + { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ + } /* Description : Store 8x1 byte block to destination memory from input vector Arguments : Inputs - in, pdst Details : Index 0 double word element from 'in' vector is copied to the GP register and stored to (pdst) */ -#define ST8x1_UB(in, pdst) { \ - uint64_t out0_m; \ - \ - out0_m = __msa_copy_u_d((v2i64)in, 0); \ - SD(out0_m, pdst); \ -} +#define ST8x1_UB(in, pdst) \ + { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ + } /* Description : Store 8x2 byte block to destination memory from input vector Arguments : Inputs - in, pdst, stride @@ -541,16 +559,17 @@ Index 1 double word element from 'in' vector is copied to the GP register and stored to (pdst + stride) */ -#define ST8x2_UB(in, pdst, stride) { \ - uint64_t out0_m, out1_m; \ - uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in, 0); \ - out1_m = __msa_copy_u_d((v2i64)in, 1); \ - \ - SD(out0_m, pblk_8x2_m); \ - SD(out1_m, pblk_8x2_m + stride); \ -} +#define ST8x2_UB(in, pdst, stride) \ + { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ + } /* Description : Store 8x4 byte block to destination memory from input vectors @@ -564,17 +583,18 @@ Index 1 double word element from 'in1' vector is copied to the GP register and stored to (pdst + 3 * stride) */ -#define ST8x4_UB(in0, in1, pdst, stride) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in0, 0); \ - out1_m = __msa_copy_u_d((v2i64)in0, 1); \ - out2_m = __msa_copy_u_d((v2i64)in1, 0); \ - out3_m = __msa_copy_u_d((v2i64)in1, 1); \ - \ - SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ -} +#define ST8x4_UB(in0, in1, pdst, stride) \ + { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ + } /* Description : average with rounding (in0 + in1 + 1) / 2. Arguments : Inputs - in0, in1, in2, in3, @@ -584,17 +604,19 @@ each unsigned byte element from 'in1' vector. Then the average with rounding is calculated and written to 'out0' */ -#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ - out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ -} +#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ + out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ + } #define AVER_UB2_UB(...) 
AVER_UB2(v16u8, __VA_ARGS__) -#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ - AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ -} +#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ + } #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) /* Description : Immediate number of elements to slide with zero @@ -604,18 +626,20 @@ Details : Byte elements from 'zero_m' vector are slid into 'in0' by value specified in the 'slide_val' */ -#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ - v16i8 zero_m = { 0 }; \ - out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ -} +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ + { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ + } #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) -#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \ - out0, out1, out2, out3, slide_val) { \ - SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ - SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ -} +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ + slide_val) \ + { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ + } #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) /* Description : Immediate number of elements to slide @@ -625,18 +649,20 @@ Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by value specified in the 'slide_val' */ -#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ - out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ -} +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ + } #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) -#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ - out0, out1, out2, slide_val) { \ - SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ - out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ -} +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ + out2, slide_val) \ + { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ + } #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) #define SLDI_B3_UH(...) 
SLDI_B3(v8u16, __VA_ARGS__) @@ -647,19 +673,21 @@ Details : Byte elements from 'in0' & 'in1' are copied selectively to 'out0' as per control vector 'mask0' */ -#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ - out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ -} +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) -#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ - out0, out1, out2, out3) { \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ -} +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ + out3) \ + { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ + } #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) @@ -673,18 +701,19 @@ The multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ -#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ - out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ -} +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ + } #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) -#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, \ - out0, out1, out2, out3) { \ - DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) /* Description : Dot product of byte vector elements @@ -697,17 +726,19 @@ The multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ -#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ -} +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ + } #define DOTP_SB2_SH(...) 
DOTP_SB2(v8i16, __VA_ARGS__) -#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ - DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) /* Description : Dot product of halfword vector elements @@ -720,18 +751,19 @@ The multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ -#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ -} +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ + } #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) -#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, \ - out0, out1, out2, out3) { \ - DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) /* Description : Dot product of word vector elements @@ -744,10 +776,11 @@ The multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ -#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ -} +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ + } #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) /* Description : Dot product & addition of byte vector elements @@ -760,17 +793,19 @@ The multiplication result of adjacent odd-even elements are added to the 'out0' vector */ -#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ -} +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ + } #define DPADD_SB2_SH(...) 
DPADD_SB2(v8i16, __VA_ARGS__) -#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ - DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) /* Description : Dot product & addition of halfword vector elements @@ -783,10 +818,11 @@ The multiplication result of adjacent odd-even elements are added to the 'out0' vector */ -#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ -} +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ + } #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) /* Description : Dot product & addition of double word vector elements @@ -799,10 +835,11 @@ The multiplication result of adjacent odd-even elements are added to the 'out0' vector */ -#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \ - out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ - out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ -} +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ + } #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) /* Description : Minimum values between unsigned elements of @@ -813,16 +850,18 @@ Details : Minimum of unsigned halfword element values from 'in0' and 'min_vec' are written to output vector 'in0' */ -#define MIN_UH2(RTYPE, in0, in1, min_vec) { \ - in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ - in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ -} +#define MIN_UH2(RTYPE, in0, in1, min_vec) \ + { \ + in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ + in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ + } #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) -#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \ - MIN_UH2(RTYPE, in0, in1, min_vec); \ - MIN_UH2(RTYPE, in2, in3, min_vec); \ -} +#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ + { \ + MIN_UH2(RTYPE, in0, in1, min_vec); \ + MIN_UH2(RTYPE, in2, in3, min_vec); \ + } #define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) /* Description : Clips all signed halfword elements of input vector @@ -831,22 +870,25 @@ Output - out_m Return Type - signed halfword */ -#define CLIP_SH_0_255(in) ({ \ - v8i16 max_m = __msa_ldi_h(255); \ - v8i16 out_m; \ - \ - out_m = __msa_maxi_s_h((v8i16)in, 0); \ - out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ - out_m; \ -}) -#define CLIP_SH2_0_255(in0, in1) { \ - in0 = CLIP_SH_0_255(in0); \ - in1 = CLIP_SH_0_255(in1); \ -} -#define CLIP_SH4_0_255(in0, in1, in2, in3) { \ - CLIP_SH2_0_255(in0, in1); \ - CLIP_SH2_0_255(in2, in3); \ -} +#define CLIP_SH_0_255(in) \ + ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ + }) +#define CLIP_SH2_0_255(in0, in1) \ + { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ + } +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ + { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ + } /* Description : Horizontal addition of 4 signed word elements of input vector Arguments : Input - in (signed word vector) @@ -855,16 +897,17 @@ Details : 4 signed word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_SW_S32(in) ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ -}) +#define HADD_SW_S32(in) \ + ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ + }) /* Description : Horizontal addition of 8 unsigned halfword elements Arguments : Inputs - in (unsigned halfword vector) @@ -873,18 +916,19 @@ Details : 8 unsigned halfword elements of input vector are added together and the resulting integer sum is returned */ -#define HADD_UH_U32(in) ({ \ - v4u32 res_m; \ - v2u64 res0_m, res1_m; \ - uint32_t sum_m; \ - \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - res0_m = __msa_hadd_u_d(res_m, res_m); \ - res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ - sum_m; \ -}) +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + res0_m = __msa_hadd_u_d(res_m, res_m); \ + res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ + sum_m; \ + }) /* Description : Horizontal addition of unsigned byte vector elements Arguments : Inputs - in0, in1 @@ -894,16 +938,18 @@ even unsigned byte element from 'in0' (pairwise) and the halfword result is written to 'out0' */ -#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ -} +#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ + } #define HADD_UB2_UH(...) 
HADD_UB2(v8u16, __VA_ARGS__) -#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ - HADD_UB2(RTYPE, in0, in1, out0, out1); \ - HADD_UB2(RTYPE, in2, in3, out2, out3); \ -} +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ + } #define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) /* Description : Horizontal subtraction of unsigned byte vector elements @@ -914,10 +960,11 @@ even unsigned byte element from 'in0' (pairwise) and the halfword result is written to 'out0' */ -#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ -} +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ + } #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) /* Description : SAD (Sum of Absolute Difference) @@ -928,18 +975,19 @@ 'ref0' is calculated and preserved in 'diff0'. Then even-odd pairs are added together to generate 8 halfword results. */ -#define SAD_UB2_UH(in0, in1, ref0, ref1) ({ \ - v16u8 diff0_m, diff1_m; \ - v8u16 sad_m = { 0 }; \ - \ - diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ - diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ - \ - sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ - sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ - \ - sad_m; \ -}) +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ + diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ + \ + sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ + sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ + \ + sad_m; \ + }) /* Description : Horizontal subtraction of signed halfword vector elements Arguments : Inputs - in0, in1 @@ -949,10 +997,11 @@ even signed halfword element from 'in0' (pairwise) and the word result is written to 'out0' */ -#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ - out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ -} +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ + } #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) /* Description : Set element n input vector to GPR value @@ -961,25 +1010,28 @@ Return Type - as per RTYPE Details : Set element 0 in vector 'out' to value specified in 'in0' */ -#define INSERT_W2(RTYPE, in0, in1, out) { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ -} +#define INSERT_W2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + } #define INSERT_W2_SB(...) 
INSERT_W2(v16i8, __VA_ARGS__) -#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ -} +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ + } #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) -#define INSERT_D2(RTYPE, in0, in1, out) { \ - out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ - out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ -} +#define INSERT_D2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ + } #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) @@ -990,10 +1042,11 @@ Details : Even byte elements of 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ -} +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + } #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) @@ -1004,10 +1057,11 @@ Details : Even halfword elements of 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ - out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ -} +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ + } #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) @@ -1019,10 +1073,11 @@ Details : Even word elements of 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ - out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ -} +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ + } #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) /* Description : Interleave even double word elements from vectors @@ -1032,10 +1087,11 @@ Details : Even double word elements of 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ - out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ -} +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ + } #define ILVEV_D2_UB(...) 
ILVEV_D2(v16u8, __VA_ARGS__) /* Description : Interleave left half of byte elements from vectors @@ -1045,20 +1101,22 @@ Details : Left half of byte elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ -} +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ + } #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) -#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) @@ -1070,10 +1128,11 @@ Details : Left half of halfword elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ -} +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ + } #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) @@ -1084,10 +1143,11 @@ Details : Left half of word elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ -} +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ + } #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) @@ -1098,33 +1158,36 @@ Details : Right half of byte elements of 'in0' and 'in1' are interleaved and written to out0. */ -#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ -} +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ + } #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) #define ILVR_B2_SH(...) 
ILVR_B2(v8i16, __VA_ARGS__) -#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) -#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3); \ - ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \ - out4, out5, out6, out7); \ -} +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ + out5, out6, out7) \ + { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ + out6, out7); \ + } #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) /* Description : Interleave right half of halfword elements from vectors @@ -1134,32 +1197,36 @@ Details : Right half of halfword elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ -} +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ + } #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) -#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) -#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ -} +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ + } #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) -#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVR_W4_UB(...) 
ILVR_W4(v16u8, __VA_ARGS__) /* Description : Interleave right half of double word elements from vectors @@ -1169,25 +1236,28 @@ Details : Right half of double word elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ - out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ -} +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ + } #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) -#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ -} +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ + } #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) -#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) @@ -1198,26 +1268,29 @@ Details : Right half of byte elements from 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ -} +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) -#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ -} +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + } #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) -#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ -} +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + } #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) @@ -1232,16 +1305,18 @@ value generated with (sat_val + 1) bit range. 
The results are written in place */ -#define SAT_UH2(RTYPE, in0, in1, sat_val) { \ - in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ -} +#define SAT_UH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ + } #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) -#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \ - SAT_UH2(RTYPE, in0, in1, sat_val); \ - SAT_UH2(RTYPE, in2, in3, sat_val) \ -} +#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_UH2(RTYPE, in0, in1, sat_val); \ + SAT_UH2(RTYPE, in2, in3, sat_val) \ + } #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) /* Description : Saturate the halfword element values to the max @@ -1254,16 +1329,18 @@ value generated with (sat_val + 1) bit range The results are written in place */ -#define SAT_SH2(RTYPE, in0, in1, sat_val) { \ - in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ -} +#define SAT_SH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ + } #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) -#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \ - SAT_SH2(RTYPE, in0, in1, sat_val); \ - SAT_SH2(RTYPE, in2, in3, sat_val); \ -} +#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_SH2(RTYPE, in0, in1, sat_val); \ + SAT_SH2(RTYPE, in2, in3, sat_val); \ + } #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) /* Description : Indexed halfword element values are replicated to all @@ -1275,17 +1352,18 @@ elements in 'out0' vector Valid index range for halfword operation is 0-7 */ -#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \ - out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ - out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ -} +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ + } #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) -#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ - out0, out1, out2, out3) { \ - SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ - SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ -} +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ + { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ + } #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) @@ -1297,19 +1375,21 @@ 'out0' & even byte elements of 'in1' are copied to the right half of 'out0'. */ -#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ -} +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ + } #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) #define PCKEV_B2_SH(...) 
PCKEV_B2(v8i16, __VA_ARGS__) -#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) @@ -1322,18 +1402,20 @@ 'out0' & even halfword elements of 'in1' are copied to the right half of 'out0'. */ -#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ -} +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ + } #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) -#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) /* Description : Pack even double word elements of vector pairs @@ -1344,18 +1426,20 @@ 'out0' & even double elements of 'in1' are copied to the right half of 'out0'. */ -#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ - out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ -} +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ + } #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) -#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) /* Description : Each byte element is logically xor'ed with immediate 128 @@ -1365,30 +1449,34 @@ Details : Each unsigned byte element from input vector 'in0' is logically xor'ed with 128 and the result is stored in-place. */ -#define XORI_B2_128(RTYPE, in0, in1) { \ - in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ - in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ -} +#define XORI_B2_128(RTYPE, in0, in1) \ + { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ + } #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) #define XORI_B2_128_SB(...) 
XORI_B2_128(v16i8, __VA_ARGS__) -#define XORI_B3_128(RTYPE, in0, in1, in2) { \ - XORI_B2_128(RTYPE, in0, in1); \ - in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ -} +#define XORI_B3_128(RTYPE, in0, in1, in2) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ + } #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) -#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \ - XORI_B2_128(RTYPE, in0, in1); \ - XORI_B2_128(RTYPE, in2, in3); \ -} +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ + } #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) -#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \ - XORI_B4_128(RTYPE, in0, in1, in2, in3); \ - XORI_B3_128(RTYPE, in4, in5, in6); \ -} +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ + { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ + } #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) /* Description : Average of signed halfword elements -> (a + b) / 2 @@ -1400,13 +1488,14 @@ in one extra bit in the result. The result is then divided by 2 and written to 'out0' */ -#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ - out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ - out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ -} +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ + } #define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) /* Description : Addition of signed halfword elements and signed saturation @@ -1417,17 +1506,19 @@ halfword elements of 'in1'. The result is then signed saturated between halfword data type range */ -#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ -} +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ + } #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) -#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) /* Description : Shift left all elements of vector (generic for all data types) @@ -1437,12 +1528,13 @@ Details : Each element of vector 'in0' is left shifted by 'shift' and the result is written in-place. 
*/ -#define SLLI_4V(in0, in1, in2, in3, shift) { \ - in0 = in0 << shift; \ - in1 = in1 << shift; \ - in2 = in2 << shift; \ - in3 = in3 << shift; \ -} +#define SLLI_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ + } /* Description : Arithmetic shift right all elements of vector (generic for all data types) @@ -1452,12 +1544,13 @@ Details : Each element of vector 'in0' is right shifted by 'shift' and the result is written in-place. 'shift' is a GP variable. */ -#define SRA_4V(in0, in1, in2, in3, shift) { \ - in0 = in0 >> shift; \ - in1 = in1 >> shift; \ - in2 = in2 >> shift; \ - in3 = in3 >> shift; \ -} +#define SRA_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ + } /* Description : Shift right arithmetic rounded words Arguments : Inputs - in0, in1, shift @@ -1469,15 +1562,17 @@ rounding and the result is written in-place. 'shift' is a vector. */ -#define SRAR_W2(RTYPE, in0, in1, shift) { \ - in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ - in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ -} - -#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \ - SRAR_W2(RTYPE, in0, in1, shift) \ - SRAR_W2(RTYPE, in2, in3, shift) \ -} +#define SRAR_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ + } + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ + } #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) /* Description : Shift right arithmetic rounded (immediate) @@ -1489,30 +1584,34 @@ shifted value for rounding and the result is written in-place. 'shift' is an immediate value. */ -#define SRARI_H2(RTYPE, in0, in1, shift) { \ - in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ - in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ -} +#define SRARI_H2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ + } #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) -#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \ - SRARI_H2(RTYPE, in0, in1, shift); \ - SRARI_H2(RTYPE, in2, in3, shift); \ -} +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ + } #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) -#define SRARI_W2(RTYPE, in0, in1, shift) { \ - in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ - in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ -} +#define SRARI_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ + } #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) -#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ - SRARI_W2(RTYPE, in0, in1, shift); \ - SRARI_W2(RTYPE, in2, in3, shift); \ -} +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ + } #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) /* Description : Logical shift right all elements of vector (immediate) @@ -1522,12 +1621,13 @@ Details : Each element of vector 'in0' is right shifted by 'shift' and the result is written in-place. 
'shift' is an immediate value. */ -#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \ - out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ - out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ - out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ - out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ -} +#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ + { \ + out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ + out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ + out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ + out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ + } #define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) /* Description : Multiplication of pairs of vectors @@ -1536,15 +1636,16 @@ Details : Each element from 'in0' is multiplied with elements from 'in1' and the result is written to 'out0' */ -#define MUL2(in0, in1, in2, in3, out0, out1) { \ - out0 = in0 * in1; \ - out1 = in2 * in3; \ -} -#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - MUL2(in0, in1, in2, in3, out0, out1); \ - MUL2(in4, in5, in6, in7, out2, out3); \ -} +#define MUL2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + } +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ + } /* Description : Addition of 2 pairs of vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1552,15 +1653,16 @@ Details : Each element in 'in0' is added to 'in1' and result is written to 'out0'. */ -#define ADD2(in0, in1, in2, in3, out0, out1) { \ - out0 = in0 + in1; \ - out1 = in2 + in3; \ -} -#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ADD2(in0, in1, in2, in3, out0, out1); \ - ADD2(in4, in5, in6, in7, out2, out3); \ -} +#define ADD2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + } +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ + } /* Description : Subtraction of 2 pairs of vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1568,17 +1670,18 @@ Details : Each element in 'in1' is subtracted from 'in0' and result is written to 'out0'. 
*/ -#define SUB2(in0, in1, in2, in3, out0, out1) { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ -} -#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ - out2 = in4 - in5; \ - out3 = in6 - in7; \ -} +#define SUB2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ + } /* Description : Sign extend halfword elements from right half of the vector Arguments : Input - in (halfword vector) @@ -1588,12 +1691,13 @@ extracted and interleaved with same vector 'in0' to generate 4 word elements keeping sign intact */ -#define UNPCK_R_SH_SW(in, out) { \ - v8i16 sign_m; \ - \ - sign_m = __msa_clti_s_h((v8i16)in, 0); \ - out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ -} +#define UNPCK_R_SH_SW(in, out) \ + { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ + } /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Input - in (unsigned byte vector) @@ -1602,11 +1706,12 @@ Details : Zero extended right half of vector is returned in 'out0' Zero extended left half of vector is returned in 'out1' */ -#define UNPCK_UB_SH(in, out0, out1) { \ - v16i8 zero_m = { 0 }; \ - \ - ILVRL_B2_SH(zero_m, in, out0, out1); \ -} +#define UNPCK_UB_SH(in, out0, out1) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ + } /* Description : Sign extend halfword elements from input vector and return the result in pair of vectors @@ -1619,91 +1724,96 @@ Then interleaved left with same vector 'in0' to generate 4 signed word elements in 'out1' */ -#define UNPCK_SH_SW(in, out0, out1) { \ - v8i16 tmp_m; \ - \ - tmp_m = __msa_clti_s_h((v8i16)in, 0); \ - ILVRL_H2_SW(tmp_m, in, out0, out1); \ -} +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ + } /* Description : Butterfly of 4 input vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Details : Butterfly operation */ -#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ - out0 = in0 + in3; \ - out1 = in1 + in2; \ - \ - out2 = in1 - in2; \ - out3 = in0 - in3; \ -} +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ + } /* Description : Butterfly of 8 input vectors Arguments : Inputs - in0 ... in7 Outputs - out0 .. out7 Details : Butterfly operation */ -#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - out0 = in0 + in7; \ - out1 = in1 + in6; \ - out2 = in2 + in5; \ - out3 = in3 + in4; \ - \ - out4 = in3 - in4; \ - out5 = in2 - in5; \ - out6 = in1 - in6; \ - out7 = in0 - in7; \ -} +#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ + \ + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ + } /* Description : Butterfly of 16 input vectors Arguments : Inputs - in0 ... in15 Outputs - out0 .. 
out15 Details : Butterfly operation */ -#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15, \ - out0, out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, out15) { \ - out0 = in0 + in15; \ - out1 = in1 + in14; \ - out2 = in2 + in13; \ - out3 = in3 + in12; \ - out4 = in4 + in11; \ - out5 = in5 + in10; \ - out6 = in6 + in9; \ - out7 = in7 + in8; \ +#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, \ + out4, out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + out0 = in0 + in15; \ + out1 = in1 + in14; \ + out2 = in2 + in13; \ + out3 = in3 + in12; \ + out4 = in4 + in11; \ + out5 = in5 + in10; \ + out6 = in6 + in9; \ + out7 = in7 + in8; \ \ - out8 = in7 - in8; \ - out9 = in6 - in9; \ - out10 = in5 - in10; \ - out11 = in4 - in11; \ - out12 = in3 - in12; \ - out13 = in2 - in13; \ - out14 = in1 - in14; \ - out15 = in0 - in15; \ -} + out8 = in7 - in8; \ + out9 = in6 - in9; \ + out10 = in5 - in10; \ + out11 = in4 - in11; \ + out12 = in3 - in12; \ + out13 = in2 - in13; \ + out14 = in1 - in14; \ + out15 = in0 - in15; \ + } /* Description : Transpose input 8x8 byte block Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - as per RTYPE */ -#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ - tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ - ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ - ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ - ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ - SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ - SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ -} +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ + tmp3_m); \ + ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ + } #define TRANSPOSE8x8_UB_UB(...) 
TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors @@ -1712,128 +1822,133 @@ Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - unsigned byte */ -#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ - ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ - ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ - ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ - \ - tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ - tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ - tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ - tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ - out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ - tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ - out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ - tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ - \ - ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ - out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ - out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ - out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ - out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ -} +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ + tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ + tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ + tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ + out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ + tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ + out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ + tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ + out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); 
\ + out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + } /* Description : Transpose 4x4 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Return Type - signed halfword */ -#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 s0_m, s1_m; \ - \ - ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ - ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ -} +#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ + } /* Description : Transpose 4x8 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - signed halfword */ -#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ - v8i16 zero_m = { 0 }; \ - \ - ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ - tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ - ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ - ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ - \ - out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ - \ - out4 = zero_m; \ - out5 = zero_m; \ - out6 = zero_m; \ - out7 = zero_m; \ -} +#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ + tmp3_n); \ + ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ + ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ + \ + out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ + } /* Description : Transpose 8x4 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - signed halfword */ -#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ 
- ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ - ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ - ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ -} +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ + ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } /* Description : Transpose 8x8 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - as per RTYPE */ -#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 s0_m, s1_m; \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ - ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ - ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ - PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ - tmp3_m, tmp7_m, out0, out2, out4, out6); \ - out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ - out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ - out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ - out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ -} +#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ + ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ + ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ + PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ + tmp7_m, out0, out2, out4, out6); \ + out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ + out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ + out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ + out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ + } #define TRANSPOSE8x8_SH_SH(...) 
TRANSPOSE8x8_H(v8i16, __VA_ARGS__) /* Description : Transpose 4x4 block with word elements in vectors @@ -1841,40 +1956,42 @@ Outputs - out0, out1, out2, out3 Return Type - signed word */ -#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ - ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ - \ - out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ - out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ - out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ - out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ -} +#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ + } /* Description : Add block 4x4 Arguments : Inputs - in0, in1, in2, in3, pdst, stride Details : Least significant 4 bytes from each input vector are added to the destination bytes, clipped between 0-255 and stored. */ -#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ - uint32_t src0_m, src1_m, src2_m, src3_m; \ - v8i16 inp0_m, inp1_m, res0_m, res1_m; \ - v16i8 dst0_m = { 0 }; \ - v16i8 dst1_m = { 0 }; \ - v16i8 zero_m = { 0 }; \ - \ - ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ - LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ - INSERT_W2_SB(src0_m, src1_m, dst0_m); \ - INSERT_W2_SB(src2_m, src3_m, dst1_m); \ - ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ - ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ - CLIP_SH2_0_255(res0_m, res1_m); \ - PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ - ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ -} +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ + { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ + } /* Description : Pack even elements of input vectors & xor with 128 Arguments : Inputs - in0, in1 @@ -1884,53 +2001,57 @@ together in one vector and the resulting vector is xor'ed with 128 to shift the range from signed to unsigned byte */ -#define PCKEV_XORI128_UB(in0, in1) ({ \ - v16u8 out_m; \ - \ - out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ - out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ - out_m; \ -}) +#define PCKEV_XORI128_UB(in0, in1) \ + ({ \ + v16u8 out_m; \ + \ + out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ + }) /* Description : Converts inputs to unsigned bytes, interleave, average & store as 8x4 unsigned byte block Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, pdst, stride */ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \ - dst0, 
dst1, dst2, dst3, pdst, stride) { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ -} +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ + pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + } /* Description : Pack even byte elements and store byte vector in destination memory Arguments : Inputs - in0, in1, pdst */ -#define PCKEV_ST_SB(in0, in1, pdst) { \ - v16i8 tmp_m; \ - \ - tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ - ST_SB(tmp_m, (pdst)); \ -} +#define PCKEV_ST_SB(in0, in1, pdst) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ + } /* Description : Horizontal 2 tap filter kernel code Arguments : Inputs - in0, in1, mask, coeff, shift */ -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \ - v16i8 tmp0_m; \ - v8u16 tmp1_m; \ - \ - tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ - tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ - tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ - \ - tmp1_m; \ -}) -#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + \ + tmp1_m; \ + }) +#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c index 3bdec28e6ef058bc7d6f173d6f38736d29afd4c2..6455814e1b84d2fbeffefd7fbfd64187e1c4c6b2 100644 --- a/vpx_dsp/mips/sad_msa.c +++ b/vpx_dsp/mips/sad_msa.c @@ -11,12 +11,13 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" -#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) { \ - out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ -} +#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ + } #define SAD_INSVE_W4_UB(...) 
SAD_INSVE_W4(v16u8, __VA_ARGS__) static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, @@ -58,8 +59,8 @@ static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); sad += SAD_UB2_UH(src0, src1, ref0, ref1); } @@ -214,8 +215,8 @@ static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, src += (4 * src_stride); LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); @@ -473,8 +474,8 @@ static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, src += (4 * src_stride); LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); @@ -793,9 +794,9 @@ static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, } static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t * const aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; int32_t ht_cnt; uint32_t src0, src1, src2, src3; @@ -854,9 +855,9 @@ static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, } static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t * const aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { int32_t ht_cnt; const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; v16u8 src0, src1, src2, src3; @@ -905,9 +906,9 @@ static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, } static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t * const aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { int32_t ht_cnt; const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; v16u8 src, ref0, ref1, ref2, ref3, diff; @@ -970,9 +971,9 @@ static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, } static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, - const uint8_t * const aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; int32_t ht_cnt; v16u8 src0, src1, ref0, ref1; @@ -1014,9 +1015,9 @@ static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, } static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, - const uint8_t * const 
aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; int32_t ht_cnt; v16u8 src0, src1, src2, src3; @@ -1114,8 +1115,8 @@ static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, ref += (4 * ref_stride); LD_UB2(sec_pred, 16, pred0, pred1); sec_pred += 32; - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); sad += SAD_UB2_UH(src0, src1, diff0, diff1); } @@ -1213,8 +1214,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, ref += ref_stride; LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, - comp0, comp1, comp2, comp3); + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); @@ -1224,8 +1225,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, ref += ref_stride; LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, - comp0, comp1, comp2, comp3); + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); @@ -1235,8 +1236,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, ref += ref_stride; LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, - comp0, comp1, comp2, comp3); + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); @@ -1246,8 +1247,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, ref += ref_stride; LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, - comp0, comp1, comp2, comp3); + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); } @@ -1258,180 +1259,180 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, return HADD_SW_S32(sad); } -#define VPX_SAD_4xHEIGHT_MSA(height) \ -uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_4xHEIGHT_MSA(height) \ + uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_8xHEIGHT_MSA(height) \ -uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_8xHEIGHT_MSA(height) \ + uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ + 
const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_16xHEIGHT_MSA(height) \ -uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_16xHEIGHT_MSA(height) \ + uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_32xHEIGHT_MSA(height) \ -uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_32xHEIGHT_MSA(height) \ + uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_64xHEIGHT_MSA(height) \ -uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_64xHEIGHT_MSA(height) \ + uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_4xHEIGHTx3_MSA(height) \ -void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_4xHEIGHTx3_MSA(height) \ + void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_8xHEIGHTx3_MSA(height) \ -void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_8xHEIGHTx3_MSA(height) \ + void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_16xHEIGHTx3_MSA(height) \ -void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_16xHEIGHTx3_MSA(height) \ + void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_32xHEIGHTx3_MSA(height) \ -void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_32xHEIGHTx3_MSA(height) \ + void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + 
sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_64xHEIGHTx3_MSA(height) \ -void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_64xHEIGHTx3_MSA(height) \ + void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_4xHEIGHTx8_MSA(height) \ -void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_4xHEIGHTx8_MSA(height) \ + void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_8xHEIGHTx8_MSA(height) \ -void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_8xHEIGHTx8_MSA(height) \ + void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_16xHEIGHTx8_MSA(height) \ -void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_16xHEIGHTx8_MSA(height) \ + void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_32xHEIGHTx8_MSA(height) \ -void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_32xHEIGHTx8_MSA(height) \ + void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_64xHEIGHTx8_MSA(height) \ -void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_64xHEIGHTx8_MSA(height) \ + void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_4xHEIGHTx4D_MSA(height) \ -void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_4xHEIGHTx4D_MSA(height) \ + void vpx_sad4x##height##x4d_msa(const uint8_t *src, 
int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_SAD_8xHEIGHTx4D_MSA(height) \ -void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_8xHEIGHTx4D_MSA(height) \ + void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_SAD_16xHEIGHTx4D_MSA(height) \ -void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_16xHEIGHTx4D_MSA(height) \ + void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_SAD_32xHEIGHTx4D_MSA(height) \ -void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_32xHEIGHTx4D_MSA(height) \ + void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_SAD_64xHEIGHTx4D_MSA(height) \ -void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_64xHEIGHTx4D_MSA(height) \ + void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_AVGSAD_4xHEIGHT_MSA(height) \ -uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_4width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_4xHEIGHT_MSA(height) \ + uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } -#define VPX_AVGSAD_8xHEIGHT_MSA(height) \ -uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_8width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_8xHEIGHT_MSA(height) \ + uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } -#define 
VPX_AVGSAD_16xHEIGHT_MSA(height) \ -uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_16width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_16xHEIGHT_MSA(height) \ + uint32_t vpx_sad16x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } -#define VPX_AVGSAD_32xHEIGHT_MSA(height) \ -uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_32width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_32xHEIGHT_MSA(height) \ + uint32_t vpx_sad32x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } -#define VPX_AVGSAD_64xHEIGHT_MSA(height) \ -uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_64width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_64xHEIGHT_MSA(height) \ + uint32_t vpx_sad64x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } // 64x64 VPX_SAD_64xHEIGHT_MSA(64); diff --git a/vpx_dsp/mips/sub_pixel_variance_msa.c b/vpx_dsp/mips/sub_pixel_variance_msa.c index a592a2d078e80b361a15c759ecda263f5c811b3b..313e06f92dda0713a0f73d9363c2b079372bf9a8 100644 --- a/vpx_dsp/mips/sub_pixel_variance_msa.c +++ b/vpx_dsp/mips/sub_pixel_variance_msa.c @@ -14,29 +14,23 @@ #include "vpx_dsp/variance.h" static const uint8_t bilinear_filters_msa[8][2] = { - { 128, 0, }, - { 112, 16, }, - { 96, 32, }, - { 80, 48, }, - { 64, 64, }, - { 48, 80, }, - { 32, 96, }, - { 16, 112, }, + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; -#define CALC_MSE_AVG_B(src, ref, var, sub) { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - \ - sub += res_l0_m + res_l1_m; \ -} +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } -#define VARIANCE_WxH(sse, diff, shift) \ - sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ sse - (((int64_t)diff * diff) >> shift) @@ -45,8 +39,7 @@ static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, + const uint8_t *sec_pred, int32_t height, 
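/*
 * CALC_MSE_AVG_B above accumulates squared pixel differences into var and
 * the signed differences into the avg/sub accumulator; the VARIANCE_WxH and
 * VARIANCE_LARGE_WxH macros then combine the two reductions as
 *
 *   variance = sse - (sum * sum) / (width * height)
 *
 * with shift = log2(width * height). A sketch in plain C using a 64-bit
 * product, as in the LARGE variant (the small-block macro keeps the product
 * in 32 bits); the function name is illustrative only.
 */
static uint32_t variance_sketch(uint32_t sse, int32_t sum, int32_t shift) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}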
int32_t *diff) { int32_t ht_cnt; uint32_t src0, src1, src2, src3; @@ -81,8 +74,7 @@ static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, + const uint8_t *sec_pred, int32_t height, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, src2, src3; @@ -99,8 +91,8 @@ static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); ref_ptr += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); CALC_MSE_AVG_B(src0, ref0, var, avg); CALC_MSE_AVG_B(src1, ref1, var, avg); @@ -117,8 +109,7 @@ static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, const uint8_t *ref_ptr, int32_t ref_stride, const uint8_t *sec_pred, - int32_t height, - int32_t *diff) { + int32_t height, int32_t *diff) { int32_t ht_cnt; v16u8 src, ref, pred; v8i16 avg = { 0 }; @@ -173,8 +164,7 @@ static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, const uint8_t *ref_ptr, int32_t ref_stride, const uint8_t *sec_pred, - int32_t height, - int32_t *diff) { + int32_t height, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, ref0, ref1, pred0, pred1; v8i16 avg = { 0 }; @@ -232,8 +222,7 @@ static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t *diff) { + const uint8_t *sec_pred, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, ref0, ref1, pred0, pred1; v8i16 avg0 = { 0 }; @@ -293,8 +282,7 @@ static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t *diff) { + const uint8_t *sec_pred, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, src2, src3; v16u8 ref0, ref1, ref2, ref3; @@ -310,8 +298,8 @@ static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, src_ptr += src_stride; LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, ref0, var, avg0); CALC_MSE_AVG_B(src2, ref2, var, avg0); CALC_MSE_AVG_B(src1, ref1, var, avg1); @@ -323,8 +311,8 @@ static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, src_ptr += src_stride; LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, ref0, var, avg0); CALC_MSE_AVG_B(src2, ref2, var, avg0); CALC_MSE_AVG_B(src1, ref1, var, avg1); @@ -343,8 +331,7 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t *diff) { + const uint8_t *sec_pred, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, src2, src3; v16u8 ref0, ref1, ref2, ref3; @@ -362,8 +349,8 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, src_ptr += src_stride; LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, 
pred3, - src0, src1, src2, src3); + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, ref0, var, avg0); CALC_MSE_AVG_B(src1, ref1, var, avg1); CALC_MSE_AVG_B(src2, ref2, var, avg2); @@ -375,8 +362,8 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, src_ptr += src_stride; LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, ref0, var, avg0); CALC_MSE_AVG_B(src1, ref1, var, avg1); CALC_MSE_AVG_B(src2, ref2, var, avg2); @@ -392,13 +379,9 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -420,11 +403,11 @@ static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src, INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); CALC_MSE_AVG_B(src0, ref, var, avg); @@ -436,13 +419,9 @@ static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 filt0, out, ref0, ref1, ref2, ref3; @@ -464,11 +443,11 @@ static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src, PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); CALC_MSE_AVG_B(out, ref0, var, avg); out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); @@ -481,13 +460,9 @@ static uint32_t 
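/*
 * The horizontal sub-pixel paths above (VSHF_B2 to gather adjacent pixel
 * pairs, DOTP with the two filter taps, SRARI by FILTER_BITS) apply a
 * two-tap bilinear filter before the SSE/sum accumulation. Per output pixel
 * this is equivalent to the scalar form below, with {f0, f1} taken from
 * bilinear_filters_msa[offset]; the taps sum to 128, and the sketch assumes
 * FILTER_BITS == 7, the rounding shift used with these filters. The vertical
 * path is the same with the second tap applied to the pixel one row below,
 * and the hv path filters horizontally first, then vertically on the
 * intermediate rows.
 */
static uint8_t bilinear_pixel_sketch(uint8_t a, uint8_t b, uint8_t f0,
                                     uint8_t f1) {
  const int32_t sum = a * f0 + b * f1; /* taps sum to 128 */
  return (uint8_t)((sum + 64) >> 7);   /* round to nearest, >> FILTER_BITS */
}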
sub_pixel_sse_diff_8width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -512,14 +487,14 @@ static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - out0, out1, out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, - out4, out5, out6, out7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, - src0, src1, src2, src3); + PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, dst0, var, avg); CALC_MSE_AVG_B(src1, dst1, var, avg); CALC_MSE_AVG_B(src2, dst2, var, avg); @@ -532,13 +507,9 @@ static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; @@ -554,13 +525,9 @@ static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; @@ -576,13 +543,9 @@ static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -608,8 +571,8 @@ static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src, dst += (4 * dst_stride); INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, - src10_r, src21_r, src32_r, src43_r); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, 
src3, src10_r, src21_r, + src32_r, src43_r); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); @@ -624,13 +587,9 @@ static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4; @@ -654,10 +613,10 @@ static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src, dst += (4 * dst_stride); PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, - vec0, vec1, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); CALC_MSE_AVG_B(src0, ref0, var, avg); @@ -671,13 +630,9 @@ static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 ref0, ref1, ref2, ref3; @@ -734,13 +689,9 @@ static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; @@ -756,13 +707,9 @@ static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; @@ -778,14 +725,10 @@ static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t 
*dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -831,14 +774,10 @@ static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 ref0, ref1, ref2, ref3; @@ -892,14 +831,10 @@ static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -969,14 +904,10 @@ static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; @@ -993,14 +924,10 @@ static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; @@ -1017,14 +944,10 @@ static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -1049,11 +972,11 @@ static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src, INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); VSHF_B2_UH(src0, src0, src1, src1, mask, 
mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); out = __msa_aver_u_b(out, pred); @@ -1066,14 +989,10 @@ static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 out, pred, filt0; @@ -1096,11 +1015,11 @@ static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); pred = LD_UB(sec_pred); @@ -1120,15 +1039,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff, - int32_t width) { +static uint32_t subpel_avg_ssediff_16w_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { int16_t filtval; uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -1157,16 +1071,16 @@ static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src, VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - out0, out1, out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, - out4, out5, out6, out7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, - tmp0, tmp1, tmp2, tmp3); - AVER_UB4_UB(tmp0, pred0, 
tmp1, pred1, tmp2, pred2, tmp3, pred3, - tmp0, tmp1, tmp2, tmp3); + PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, + tmp2, tmp3); + AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, + tmp2, tmp3); CALC_MSE_AVG_B(tmp0, dst0, var, avg); CALC_MSE_AVG_B(tmp1, dst1, var, avg); @@ -1180,33 +1094,25 @@ static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, filter, height, diff, 16); } -static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 32); + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); src += 16; dst += 16; sec_pred += 16; @@ -1217,21 +1123,17 @@ static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 64); + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); src += 16; dst += 16; sec_pred += 16; @@ -1242,14 +1144,10 @@ static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -1276,8 +1174,8 @@ static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src, dst += (4 * dst_stride); INSERT_W4_UB(ref0, ref1, ref2, 
ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, - src10_r, src21_r, src32_r, src43_r); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); @@ -1294,14 +1192,10 @@ static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4; @@ -1326,10 +1220,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src, LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); dst += (4 * dst_stride); PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, - vec0, vec1, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); @@ -1345,15 +1239,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff, - int32_t width) { +static uint32_t subpel_avg_ssediff_16w_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { int16_t filtval; uint32_t loop_cnt; v16u8 ref0, ref1, ref2, ref3; @@ -1401,8 +1290,8 @@ static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src, LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); dst += (4 * dst_stride); - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, - out0, out1, out2, out3); + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); CALC_MSE_AVG_B(out0, ref0, var, avg); CALC_MSE_AVG_B(out1, ref1, var, avg); @@ -1416,33 +1305,25 @@ static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, filter, height, diff, 16); } -static uint32_t 
sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 32); + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); src += 16; dst += 16; sec_pred += 16; @@ -1453,21 +1334,17 @@ static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 64); + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); src += 16; dst += 16; sec_pred += 16; @@ -1479,11 +1356,9 @@ static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src, } static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -1532,11 +1407,9 @@ static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( } static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 ref0, ref1, ref2, ref3; @@ -1598,16 +1471,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( return HADD_SW_S32(var); } -static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff, - int32_t width) { +static uint32_t subpel_avg_ssediff_16w_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t 
*filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { int16_t filtval; uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -1669,8 +1536,8 @@ static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src, LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); dst += (4 * dst_stride); - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, - out0, out1, out2, out3); + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); CALC_MSE_AVG_B(out0, ref0, var, avg); CALC_MSE_AVG_B(out1, ref1, var, avg); @@ -1685,22 +1552,18 @@ static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src, } static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, sec_pred, filter_horiz, filter_vert, height, diff, 16); } static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; @@ -1719,11 +1582,9 @@ static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( } static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; @@ -1756,47 +1617,40 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( #define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); #define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); -#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ -uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \ - int32_t src_stride, \ - int32_t xoffset, \ - int32_t yoffset, \ - const uint8_t *ref, \ - int32_t ref_stride, \ - uint32_t *sse) { \ - int32_t diff; \ - uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \ - ref, ref_stride, \ - h_filter, v_filter, \ - ht, &diff); \ - } else { \ - *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \ - ref, ref_stride, \ - v_filter, ht, &diff); \ - } \ - \ - var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \ - ref, ref_stride, \ - h_filter, ht, &diff); \ - \ - var = 
VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - var = vpx_variance##wd##x##ht##_msa(src, src_stride, \ - ref, ref_stride, sse); \ - } \ - } \ - \ - return var; \ -} +#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4); VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8); @@ -1817,42 +1671,37 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32); VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); #define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ -uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, \ - int32_t xoffset, int32_t yoffset, \ - const uint8_t *ref_ptr, int32_t ref_stride, \ - uint32_t *sse, const uint8_t *sec_pred) { \ - int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, v_filter, \ - ht, &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - ht, &diff); \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ } else { \ - *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, ht, &diff); \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, ht, &diff); \ + } \ } \ - } \ \ - return VARIANCE_##wd##Wx##ht##H(*sse, 
diff); \ -} + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4); VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8); @@ -1870,11 +1719,9 @@ VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32); uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t src_stride, - int32_t xoffset, - int32_t yoffset, + int32_t xoffset, int32_t yoffset, const uint8_t *ref_ptr, - int32_t ref_stride, - uint32_t *sse, + int32_t ref_stride, uint32_t *sse, const uint8_t *sec_pred) { int32_t diff; const uint8_t *h_filter = bilinear_filters_msa[xoffset]; @@ -1882,22 +1729,19 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, if (yoffset) { if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride, - ref_ptr, ref_stride, - sec_pred, h_filter, - v_filter, 64, &diff); + *sse = sub_pixel_avg_sse_diff_32width_hv_msa( + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, + v_filter, 64, &diff); } else { - *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, - ref_ptr, ref_stride, - sec_pred, v_filter, - 64, &diff); + *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + v_filter, 64, &diff); } } else { if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, - ref_ptr, ref_stride, - sec_pred, h_filter, - 64, &diff); + *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + h_filter, 64, &diff); } else { *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, &diff); @@ -1907,46 +1751,38 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, return VARIANCE_32Wx64H(*sse, diff); } -#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ -uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr, \ - int32_t src_stride, \ - int32_t xoffset, \ - int32_t yoffset, \ - const uint8_t *ref_ptr, \ - int32_t ref_stride, \ - uint32_t *sse, \ - const uint8_t *sec_pred) { \ - int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, v_filter, \ - ht, &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - ht, &diff); \ - } else { \ - *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, &diff); \ - } \ - } \ - \ - return VARIANCE_64Wx##ht##H(*sse, diff); \ -} +#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ + uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ + src_ptr, 
src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32); VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64); diff --git a/vpx_dsp/mips/subtract_msa.c b/vpx_dsp/mips/subtract_msa.c index 9ac43c5cd5225c8a579d47f3ccd31f38bb47de66..391a7ebf66251daec485a91e7ca936a4996cc42b 100644 --- a/vpx_dsp/mips/subtract_msa.c +++ b/vpx_dsp/mips/subtract_msa.c @@ -68,8 +68,8 @@ static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); src += (8 * src_stride); - LD_SB8(pred, pred_stride, - pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7); + LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, + pred7); pred += (8 * pred_stride); ILVRL_B2_UB(src0, pred0, src_l0, src_l1); @@ -226,31 +226,31 @@ static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, } } -void vpx_subtract_block_msa(int32_t rows, int32_t cols, - int16_t *diff_ptr, ptrdiff_t diff_stride, - const uint8_t *src_ptr, ptrdiff_t src_stride, - const uint8_t *pred_ptr, ptrdiff_t pred_stride) { +void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { if (rows == cols) { switch (rows) { case 4: - sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; case 8: - sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; case 16: - sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; case 32: - sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; case 64: - sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; default: vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, diff --git a/vpx_dsp/mips/txfm_macros_msa.h b/vpx_dsp/mips/txfm_macros_msa.h index 68c63d56f67f524311ff62ae8aca30a7a4758b3c..da100f6a9808af6c3105057a9662159f884fe3d9 100644 --- a/vpx_dsp/mips/txfm_macros_msa.h +++ b/vpx_dsp/mips/txfm_macros_msa.h @@ -13,81 +13,84 @@ #include "vpx_dsp/mips/macros_msa.h" -#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \ - v8i16 k0_m = __msa_fill_h(cnst0); \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - s0_m = (v4i32)__msa_fill_h(cnst1); \ - k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ - \ - ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ - ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ - DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ - SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ - \ - DOTP_SH2_SW(s3_m, s2_m, 
k0_m, k0_m, s1_m, s0_m); \ - SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ - out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ -} +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + { \ + v8i16 k0_m = __msa_fill_h(cnst0); \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + s0_m = (v4i32)__msa_fill_h(cnst1); \ + k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + \ + ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ + DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + \ + DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + } -#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \ - dst0, dst1, dst2, dst3) { \ - v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ - v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ - \ - DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \ - tp0_m, tp2_m, tp3_m, tp4_m); \ - DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \ - tp5_m, tp6_m, tp7_m, tp8_m); \ - BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ - BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ - SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ - SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ - PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \ - dst0, dst1, dst2, dst3); \ -} +#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \ + dst1, dst2, dst3) \ + { \ + v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ + v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ + \ + DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \ + tp4_m); \ + DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \ + tp8_m); \ + BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ + BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ + SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ + SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \ + dst1, dst2, dst3); \ + } -#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \ - v8i16 dst_m; \ - v4i32 tp0_m, tp1_m; \ - \ - DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ - SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ - dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ - \ - dst_m; \ -}) +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \ + ({ \ + v8i16 dst_m; \ + v4i32 tp0_m, tp1_m; \ + \ + DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ + SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ + dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ + \ + dst_m; \ + }) -#define MADD_SHORT(m0, m1, c0, c1, res0, res1) { \ - v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ - v8i16 madd_s0_m, madd_s1_m; \ - \ - ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \ - c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \ - SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ -} +#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \ + { \ + v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ + v8i16 madd_s0_m, madd_s1_m; \ + \ + ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \ + madd0_m, madd1_m, madd2_m, madd3_m); \ + SRARI_W4_SW(madd0_m, 
madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ + } -#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ - out0, out1, out2, out3) { \ - v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ - \ - ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ - ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ - cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ - m4_m, m5_m, tmp3_m, tmp2_m); \ - SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ - cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ - m4_m, m5_m, tmp3_m, tmp2_m); \ - SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ -} +#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \ + cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \ + cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ + } #endif // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ diff --git a/vpx_dsp/mips/variance_msa.c b/vpx_dsp/mips/variance_msa.c index 33e175560fd3207fc86917095ca6c03d393a670e..085990e48459f24365c18e842f88bcc2a201928e 100644 --- a/vpx_dsp/mips/variance_msa.c +++ b/vpx_dsp/mips/variance_msa.c @@ -11,28 +11,29 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" -#define CALC_MSE_B(src, ref, var) { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ -} +#define CALC_MSE_B(src, ref, var) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + } -#define CALC_MSE_AVG_B(src, ref, var, sub) { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - \ - sub += res_l0_m + res_l1_m; \ -} +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, 
var); \ + \ + sub += res_l0_m + res_l1_m; \ + } -#define VARIANCE_WxH(sse, diff, shift) \ - sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ sse - (((int64_t)diff * diff) >> shift) @@ -80,8 +81,8 @@ static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); ref_ptr += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); CALC_MSE_AVG_B(src0, ref0, var, avg); CALC_MSE_AVG_B(src1, ref1, var, avg); } @@ -370,8 +371,8 @@ static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride, LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); ref_ptr += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); CALC_MSE_B(src0, ref0, var); CALC_MSE_B(src1, ref1, var); } @@ -526,19 +527,17 @@ uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, #define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); #define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); -#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \ -uint32_t vpx_variance##wd##x##ht##_msa(const uint8_t *src, \ - int32_t src_stride, \ - const uint8_t *ref, \ - int32_t ref_stride, \ - uint32_t *sse) { \ - int32_t diff; \ - \ - *sse = sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, \ - ht, &diff); \ - \ - return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ -} +#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t vpx_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } VPX_VARIANCE_WDXHT_MSA(4, 4); VPX_VARIANCE_WDXHT_MSA(4, 8); @@ -585,8 +584,7 @@ uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride, } uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse) { + const uint8_t *ref, int32_t ref_stride, uint32_t *sse) { *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); return *sse; @@ -617,17 +615,15 @@ uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride, } void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse, int32_t *sum) { + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); } void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse, int32_t *sum) { + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); } -uint32_t vpx_get_mb_ss_msa(const int16_t *src) { - return get_mb_ss_msa(src); -} +uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); } diff --git a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index f6244d834b7d4f24acd22c1b961071c62e71e798..ad2af286692155bccbee8df80abbbd41dee655d6 100644 --- 
a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -13,8 +13,7 @@ #include "vpx_dsp/mips/vpx_convolve_msa.h" static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; @@ -48,8 +47,7 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; @@ -92,10 +90,8 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) { common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); @@ -105,10 +101,8 @@ static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { int32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; @@ -136,18 +130,16 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, - dst, dst_stride); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + dst_stride); dst += (4 * dst_stride); } } static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { int32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, dst0, dst1; @@ -199,11 +191,9 @@ static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 dst1, dst2, mask0, mask1, mask2, mask3; @@ -256,11 +246,9 @@ static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt, cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 dst1, dst2, mask0, mask1, mask2, mask3; @@ -318,8 +306,7 @@ static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, mask; @@ -344,8 +331,7 @@ static 
void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; @@ -378,10 +364,8 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) { common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); @@ -391,8 +375,7 @@ static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, mask; @@ -412,16 +395,13 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); } -static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, - int32_t height) { +static void common_hz_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { v16i8 src0, src1, src2, src3, mask; v16u8 filt0, dst0, dst1, dst2, dst3; v8u16 vec0, vec1, vec2, vec3, filt; @@ -442,8 +422,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -452,8 +432,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); dst += (4 * dst_stride); if (16 == height) { @@ -467,8 +447,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_SB4(src, src_stride, src0, src1, src2, src3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -477,16 +457,14 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + 
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); } } static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) { common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); @@ -497,11 +475,9 @@ static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, dst0, dst1, dst2, dst3; @@ -566,11 +542,9 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, dst0, dst1, dst2, dst3; @@ -617,11 +591,9 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, dst0, dst1, dst2, dst3; @@ -662,8 +634,8 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -676,67 +648,55 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_x)[0] == 0) { switch (w) { case 4: - common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; case 8: - common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; case 16: - common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; case 32: - common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; case 64: - common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, 
x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else { switch (w) { case 4: - common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; case 8: - common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; case 16: - common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; case 32: - common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; case 64: - common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 2abde6de83c3e14131866426cb21721764d76cdc..1cfa63201c5acfefbf1170ec42d902be0c26edea 100644 --- a/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -12,13 +12,9 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/vpx_convolve_msa.h" -static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; @@ -64,15 +60,15 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, src += (4 * src_stride); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, @@ -94,13 +90,9 @@ static void 
common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, } } -static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; @@ -154,20 +146,20 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, filt_vt2, filt_vt3); @@ -180,8 +172,8 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, - dst, dst_stride); + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, + dst_stride); dst += (4 * dst_stride); hz_out6 = hz_out10; @@ -194,13 +186,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, } } -static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 2; multiple8_cnt--;) { common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, @@ -210,13 +198,9 @@ static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, } } -static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 4; multiple8_cnt--;) { common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, @@ -226,13 +210,9 @@ static void 
common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, } } -static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 8; multiple8_cnt--;) { common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, @@ -242,12 +222,9 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, } } -static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { +static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, vec0, vec1; v16u8 dst0, dst1, dst2, dst3, res0, res1; @@ -280,12 +257,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); } -static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { +static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -316,29 +290,25 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, - dst4, dst6); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, - tmp0, tmp1, tmp2, tmp3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, + tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, - res2, res3); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, - res2, res3); + PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, + res3); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, + res3); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); } -static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { if (4 == height) { 
common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, filter_vert); @@ -348,12 +318,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src, } } -static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { +static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; @@ -390,17 +357,13 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, tmp3 = __msa_dotp_u_h(vec3, filt_vt); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); } -static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; @@ -445,36 +408,27 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); dst += (4 * dst_stride); } } -static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { if (4 == height) { common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, filter_vert); } else { - common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, - height); + common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); } } -static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; @@ -536,13 +490,9 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src, } } -static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t 
height) { +static void common_hv_2ht_2vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 2; multiple8_cnt--;) { common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, @@ -552,13 +502,9 @@ static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src, } } -static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 4; multiple8_cnt--;) { common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, @@ -571,8 +517,8 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -589,72 +535,69 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, ((const int32_t *)filter_y)[0] == 0) { switch (w) { case 4: - common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); + common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); break; case 8: - common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); + common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); break; case 16: - common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], &filt_ver[3], h); break; case 32: - common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], &filt_ver[3], h); break; case 64: - common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], &filt_ver[3], h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); } else { switch (w) { case 4: - common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + 
(int32_t)dst_stride, filt_hor, + filt_ver, h); break; case 8: - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); break; case 16: - common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); break; case 32: - common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); break; case 64: - common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index 0164e41aa161a3c5614b86c2ac71687089a29083..146ce3b2f596d63ffc2bcc01ae03da08cdf9cbc4 100644 --- a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -13,10 +13,8 @@ #include "vpx_dsp/mips/vpx_convolve_msa.h" static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; @@ -73,10 +71,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, } static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; @@ -106,18 +102,18 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, XORI_B4_128_SB(src7, src8, src9, src10); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); - out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); + out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1, + filt2, filt3); + out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1, + filt2, filt3); + out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1, + filt2, filt3); out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, - dst, dst_stride); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, 
dst2, dst3, dst, + dst_stride); dst += (4 * dst_stride); src10_r = src54_r; @@ -130,13 +126,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, } } -static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, - int32_t height, - int32_t width) { +static void common_vt_8t_and_aver_dst_16w_mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, int32_t width) { const uint8_t *src_tmp; uint8_t *dst_tmp; uint32_t loop_cnt, cnt; @@ -227,38 +219,31 @@ static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, } static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, 16); } static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, 32); } static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, 64); } static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, src4; @@ -292,8 +277,7 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -311,15 +295,15 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, src8 = LD_SB(src); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, - dst2, dst3); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, + dst3); ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, src76_r, src87_r); - ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, - src87_r, src76_r, src2110, src4332, src6554, src8776); + ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); @@ -331,10 +315,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) 
{ common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); @@ -344,8 +326,7 @@ static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16u8 src0, src1, src2, src3, src4; @@ -364,16 +345,13 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); } -static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, - int32_t height) { +static void common_vt_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; @@ -393,22 +371,22 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, src += (8 * src_stride); LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, - vec2, vec3); - ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, - vec6, vec7); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, + dst_stride); dst += (4 * dst_stride); DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, + dst_stride); dst += (4 * dst_stride); src0 = src8; @@ -416,10 +394,8 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) { common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); @@ -430,11 +406,9 @@ static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -481,11 +455,9 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, 
- uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -554,11 +526,9 @@ static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5; v16u8 src6, src7, src8, src9, src10, src11, filt0; @@ -636,8 +606,8 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -650,68 +620,56 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_y)[0] == 0) { switch (w) { case 4: - common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; case 8: - common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; case 16: - common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; case 32: - common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; case 64: - common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else { switch (w) { case 4: - common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; case 8: - common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; case 16: - common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; case 32: - common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_32w_msa(src, 
(int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; case 64: - common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index dbd120b0d5c546629f1b1175147e8c9169b45947..9e8bf7b5194374ac0d384bb690e8649c4fb353ec 100644 --- a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -325,7 +325,7 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src1, src2, src3); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); @@ -347,7 +347,7 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); @@ -355,8 +355,8 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, vec6, vec7); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, - res2, res3); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); @@ -383,7 +383,7 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src1, src2, src3); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -406,7 +406,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); @@ -482,7 +482,7 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src2, src4, src6); LD_SB4(src + 8, src_stride, src1, src3, src5, src7); @@ -545,7 +545,7 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); for (loop_cnt = height >> 1; loop_cnt--;) { src0 = LD_SB(src); @@ -590,7 +590,7 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) 
__msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); for (loop_cnt = height; loop_cnt--;) { src0 = LD_SB(src); @@ -622,8 +622,8 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -636,67 +636,55 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_x)[0] == 0) { switch (w) { case 4: - common_hz_2t_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; case 8: - common_hz_2t_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; case 16: - common_hz_2t_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; case 32: - common_hz_2t_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; case 64: - common_hz_2t_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else { switch (w) { case 4: - common_hz_8t_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; case 8: - common_hz_8t_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; case 16: - common_hz_8t_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; case 32: - common_hz_8t_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; case 64: - common_hz_8t_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_msa.c b/vpx_dsp/mips/vpx_convolve8_msa.c index 7546f13150095c355b8fd1a200071e668b96978e..b16ec57886a533a1283c7f440f5bd24b2b2f34d5 100644 --- a/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_msa.c @@ -69,15 +69,15 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, 
filt_hz3); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, @@ -151,20 +151,20 @@ static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src7, src8, src9, src10); - hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, filt_vt2, filt_vt3); @@ -295,11 +295,11 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, - vec4, vec5, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, + vec5, vec6, vec7); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, - res2, res3); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); @@ -361,12 +361,10 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, } static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { + int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, mask, out0, out1; v16u8 filt_hz, filt_vt, vec0; @@ -542,11 +540,10 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, } } -void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t x_step_q4, - const int16_t *filter_y, 
int32_t y_step_q4, - int32_t w, int32_t h) { +void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int32_t x_step_q4, const int16_t *filter_y, + int32_t y_step_q4, int32_t w, int32_t h) { int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -563,72 +560,69 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, ((const int32_t *)filter_y)[0] == 0) { switch (w) { case 4: - common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; case 8: - common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; case 16: - common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; case 32: - common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; case 64: - common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); } else { switch (w) { case 4: - common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); break; case 8: - common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); break; case 16: - common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); break; case 32: - common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); break; case 64: - common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, 
filt_ver, + (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 527d4571991cd5ba56b5505f15bf74620ec3bcc2..410682271f5872f29eb53736db964568e2a5cb58 100644 --- a/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -222,11 +222,11 @@ static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); src_tmp += (7 * src_stride); - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, - src32_r, src54_r, src21_r); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, - src32_l, src54_l, src21_l); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); for (loop_cnt = (height >> 2); loop_cnt--;) { @@ -344,8 +344,8 @@ static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, src32_r, src43_r); ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, src76_r, src87_r); - ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, - src87_r, src76_r, src2110, src4332, src6554, src8776); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); @@ -407,10 +407,10 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); src += (8 * src_stride); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, - vec2, vec3); - ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, - vec6, vec7); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); @@ -629,8 +629,8 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -643,67 +643,55 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_y)[0] == 0) { switch (w) { case 4: - common_vt_2t_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], h); break; case 8: - common_vt_2t_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], 
h); break; case 16: - common_vt_2t_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], h); break; case 32: - common_vt_2t_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], h); break; case 64: - common_vt_2t_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else { switch (w) { case 4: - common_vt_8t_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; case 8: - common_vt_8t_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; case 16: - common_vt_8t_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; case 32: - common_vt_8t_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; case 64: - common_vt_8t_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve_avg_msa.c b/vpx_dsp/mips/vpx_convolve_avg_msa.c index 4c3d978031a57e008e5903e471e9ebd86b227035..45399bad852c552722f969c9c92a8c18995227b7 100644 --- a/vpx_dsp/mips/vpx_convolve_avg_msa.c +++ b/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -10,8 +10,8 @@ #include "vpx_dsp/mips/macros_msa.h" -static void avg_width4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { +static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { int32_t cnt; uint32_t out0, out1, out2, out3; v16u8 src0, src1, src2, src3; @@ -24,8 +24,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride, LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); out0 = __msa_copy_u_w((v4i32)dst0, 0); out1 = __msa_copy_u_w((v4i32)dst1, 0); @@ -53,8 +53,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride, } } -static void avg_width8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { +static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { int32_t cnt; uint64_t out0, out1, out2, out3; v16u8 src0, src1, src2, src3; @@ -65,8 +65,8 @@ static void avg_width8_msa(const uint8_t *src, int32_t src_stride, src += (4 * src_stride); LD_UB4(dst, dst_stride, dst0, 
dst1, dst2, dst3); - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); out0 = __msa_copy_u_d((v2i64)dst0, 0); out1 = __msa_copy_u_d((v2i64)dst1, 0); @@ -88,10 +88,10 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride, src += (8 * src_stride); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, - dst4, dst5, dst6, dst7); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); dst += (8 * dst_stride); } @@ -120,14 +120,14 @@ static void avg_width32_msa(const uint8_t *src, int32_t src_stride, LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); dst_dup += (4 * dst_stride); - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, - dst4, dst5, dst6, dst7); - AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, - dst8, dst9, dst10, dst11); - AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, - dst12, dst13, dst14, dst15); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); @@ -166,14 +166,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); dst_dup += dst_stride; - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, - dst4, dst5, dst6, dst7); - AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, - dst8, dst9, dst10, dst11); - AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, - dst12, dst13, dst14, dst15); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); ST_UB4(dst0, dst1, dst2, dst3, dst, 16); dst += dst_stride; diff --git a/vpx_dsp/mips/vpx_convolve_copy_msa.c b/vpx_dsp/mips/vpx_convolve_copy_msa.c index ba4012281e84ae9402cff599fabc65b65f81de18..c3d87a4ab8112a3ceec5d64cd7356c2bfdec2dc7 100644 --- a/vpx_dsp/mips/vpx_convolve_copy_msa.c +++ b/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -105,12 +105,12 @@ static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, dst_tmp = dst; for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_UB8(src_tmp, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); src_tmp += (8 * src_stride); - ST_UB8(src0, 
src1, src2, src3, src4, src5, src6, src7, - dst_tmp, dst_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp, + dst_stride); dst_tmp += (8 * dst_stride); } diff --git a/vpx_dsp/mips/vpx_convolve_msa.h b/vpx_dsp/mips/vpx_convolve_msa.h index e0013983ae6ae4c0635819753def290e94917dac..198c21ed20a5d9624dd91aa702e49e76698fcdf3 100644 --- a/vpx_dsp/mips/vpx_convolve_msa.h +++ b/vpx_dsp/mips/vpx_convolve_msa.h @@ -16,104 +16,109 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \ - filt0, filt1, filt2, filt3) ({ \ - v8i16 tmp0, tmp1; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ - tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ - tmp0 = __msa_adds_s_h(tmp0, tmp1); \ - \ - tmp0; \ -}) +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp0, tmp1; \ + \ + tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ + tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ + tmp0 = __msa_adds_s_h(tmp0, tmp1); \ + \ + tmp0; \ + }) -#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \ - filt_h0, filt_h1, filt_h2, filt_h3) ({ \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - v8i16 hz_out_m; \ - \ - VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \ - vec0_m, vec1_m, vec2_m, vec3_m); \ - hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \ - filt_h0, filt_h1, filt_h2, filt_h3); \ - \ - hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ - \ - hz_out_m; \ -}) +#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ + filt_h1, filt_h2, filt_h3) \ + ({ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \ + vec3_m); \ + hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \ + filt_h1, filt_h2, filt_h3); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) -#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ - mask0, mask1, mask2, mask3, \ - filt0, filt1, filt2, filt3, \ - out0, out1) { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ - DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ - DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ - ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ -} +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, 
res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ + } -#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ - mask0, mask1, mask2, mask3, \ - filt0, filt1, filt2, filt3, \ - out0, out1, out2, out3) { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - res0_m, res1_m, res2_m, res3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ - res4_m, res5_m, res6_m, res7_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ - res0_m, res1_m, res2_m, res3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ - res4_m, res5_m, res6_m, res7_m); \ - ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ - res7_m, out0, out1, out2, out3); \ -} +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, res6_m, res7_m); \ + ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ + res7_m, out0, out1, out2, out3); \ + } -#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) { \ - v16u8 tmp_m; \ - \ - tmp_m = PCKEV_XORI128_UB(in1, in0); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ - ST_UB(tmp_m, (pdst)); \ -} 
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = PCKEV_XORI128_UB(in1, in0); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } -#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) { \ - v16u8 tmp_m; \ - \ - tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ - ST_UB(tmp_m, (pdst)); \ -} +#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } -#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \ - pdst, stride) { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - \ - PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ - PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ -} -#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ +#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ + stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ + PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + } +#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ diff --git a/vpx_dsp/prob.c b/vpx_dsp/prob.c index 639d24dd2f0158c1f119df9c2f50cf4f7b3bf5fc..819e95062e9fea6d4791d283ce3138fd9235ddcb 100644 --- a/vpx_dsp/prob.c +++ b/vpx_dsp/prob.c @@ -11,22 +11,16 @@ #include "./prob.h" const uint8_t vpx_norm[256] = { - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static unsigned int tree_merge_probs_impl(unsigned int i, @@ -35,13 +29,13 @@ static unsigned int tree_merge_probs_impl(unsigned int i, const unsigned int *counts, vpx_prob *probs) { 
const int l = tree[i]; - const unsigned int left_count = (l <= 0) - ? counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); + const unsigned int left_count = + (l <= 0) ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); const int r = tree[i + 1]; - const unsigned int right_count = (r <= 0) - ? counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); + const unsigned int right_count = + (r <= 0) ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); const unsigned int ct[2] = { left_count, right_count }; probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct); return left_count + right_count; diff --git a/vpx_dsp/prob.h b/vpx_dsp/prob.h index c3cb103ffb5e8ca77f234eddbe01f3d304631176..148116ed08994e5c0db84cd47f7a8acc51ea0077 100644 --- a/vpx_dsp/prob.h +++ b/vpx_dsp/prob.h @@ -24,11 +24,11 @@ typedef uint8_t vpx_prob; #define MAX_PROB 255 -#define vpx_prob_half ((vpx_prob) 128) +#define vpx_prob_half ((vpx_prob)128) typedef int8_t vpx_tree_index; -#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2) +#define TREE_SIZE(leaf_count) (2 * (leaf_count)-2) #define vpx_complement(x) (255 - x) @@ -60,8 +60,7 @@ static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) { return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); } -static INLINE vpx_prob merge_probs(vpx_prob pre_prob, - const unsigned int ct[2], +static INLINE vpx_prob merge_probs(vpx_prob pre_prob, const unsigned int ct[2], unsigned int count_sat, unsigned int max_update_factor) { const vpx_prob prob = get_binary_prob(ct[0], ct[1]); @@ -72,7 +71,7 @@ static INLINE vpx_prob merge_probs(vpx_prob pre_prob, // MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = { - 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 }; @@ -93,7 +92,6 @@ static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob, void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs, const unsigned int *counts, vpx_prob *probs); - DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]); #ifdef __cplusplus diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index 5bf78627114ba36e0c2884d418fd5fb693157cc3..47afd4388abfba8d05e9295c1ee6ffd9d6219688 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -14,7 +14,6 @@ #include "vpx_dsp/psnr.h" #include "vpx_scale/yv12config.h" - double vpx_sse_to_psnr(double samples, double peak, double sse) { if (sse > 0.0) { const double psnr = 10.0 * log10(samples * peak * peak / sse); @@ -27,9 +26,9 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() * and highbd_8_variance(). It should not. 
*/ -static void encoder_variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, unsigned int *sse, int *sum) { +static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, unsigned int *sse, + int *sum) { int i, j; *sum = 0; @@ -48,10 +47,9 @@ static void encoder_variance(const uint8_t *a, int a_stride, } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { +static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, uint64_t *sse, int64_t *sum) { int i, j; uint16_t *a = CONVERT_TO_SHORTPTR(a8); @@ -70,22 +68,20 @@ static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, } } -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum) { +static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, unsigned int *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, - &sse_long, &sum_long); + encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, + &sum_long); *sse = (unsigned int)sse_long; *sum = (int)sum_long; } #endif // CONFIG_VP9_HIGHBITDEPTH -static int64_t get_sse(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int width, int height) { +static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; @@ -94,15 +90,15 @@ static int64_t get_sse(const uint8_t *a, int a_stride, int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, - dw, height, &sse, &sum); + encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, + height, &sse, &sum); total_sse += sse; } if (dh > 0) { encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); + &b[(height - dh) * b_stride], b_stride, width - dw, dh, + &sse, &sum); total_sse += sse; } @@ -126,9 +122,8 @@ static int64_t get_sse(const uint8_t *a, int a_stride, #if CONFIG_VP9_HIGHBITDEPTH static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int width, int height, - unsigned int input_shift) { + const uint8_t *b8, int b_stride, int width, + int height, unsigned int input_shift) { const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); int64_t total_sse = 0; @@ -145,9 +140,8 @@ static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, return total_sse; } -static int64_t highbd_get_sse(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int width, int height) { +static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { int64_t total_sse = 0; int x, y; const int dw = width % 16; @@ -155,15 +149,14 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, unsigned int sse = 0; int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, - &b[width - dw], b_stride, - dw, height, &sse, &sum); + encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], + b_stride, 
dw, height, &sse, &sum); total_sse += sse; } if (dh > 0) { encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); total_sse += sse; } for (y = 0; y < height / 16; ++y) { @@ -182,38 +175,35 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, } #endif // CONFIG_VP9_HIGHBITDEPTH - int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { + const YV12_BUFFER_CONFIG *b) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height); + a->y_crop_width, a->y_crop_height); } #if CONFIG_VP9_HIGHBITDEPTH int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { + const YV12_BUFFER_CONFIG *b) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height); + a->y_crop_width, a->y_crop_height); } #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr, uint32_t bit_depth, - uint32_t in_bit_depth) { - const int widths[3] = - { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; - const int heights[3] = - { a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + uint32_t bit_depth, uint32_t in_bit_depth) { + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; @@ -231,17 +221,14 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, uint64_t sse; if (a->flags & YV12_FLAG_HIGHBITDEPTH) { if (input_shift) { - sse = highbd_get_sse_shift(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], w, h, - input_shift); + sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h, input_shift); } else { - sse = highbd_get_sse(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], w, h); + sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h); } } else { - sse = get_sse(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], - w, h); + sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); } psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; @@ -253,8 +240,8 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, psnr->sse[0] = total_sse; psnr->samples[0] = total_samples; - psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak, - (double)total_sse); + psnr->psnr[0] = + vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse); } #endif // !CONFIG_VP9_HIGHBITDEPTH @@ -262,10 +249,9 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr) { static const double peak = 255.0; - const int widths[3] = { - a->y_crop_width, a->uv_crop_width, 
a->uv_crop_width }; - const int heights[3] = { - a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; @@ -278,9 +264,8 @@ void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, const int w = widths[i]; const int h = heights[i]; const uint32_t samples = w * h; - const uint64_t sse = get_sse(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], - w, h); + const uint64_t sse = + get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse); @@ -291,6 +276,6 @@ void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, psnr->sse[0] = total_sse; psnr->samples[0] = total_samples; - psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak, - (double)total_sse); + psnr->psnr[0] = + vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse); } diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h index e25b4504af84445dd3d2fb55ca35041c71f0deeb..f321131d0b9f740b9d1b85396c5828ff57d8252b 100644 --- a/vpx_dsp/psnr.h +++ b/vpx_dsp/psnr.h @@ -11,7 +11,6 @@ #ifndef VPX_DSP_PSNR_H_ #define VPX_DSP_PSNR_H_ - #include "vpx_scale/yv12config.h" #define MAX_PSNR 100.0 @@ -37,25 +36,20 @@ typedef struct { * \param[in] sse Sum of squared errors */ double vpx_sse_to_psnr(double samples, double peak, double sse); -int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b); +int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); #if CONFIG_VP9_HIGHBITDEPTH int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr, - unsigned int bit_depth, - unsigned int in_bit_depth); + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + unsigned int bit_depth, unsigned int in_bit_depth); #endif -void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr); +void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr); double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *phvs_y, double *phvs_u, - double *phvs_v, uint32_t bd, uint32_t in_bd); + const YV12_BUFFER_CONFIG *dest, double *phvs_y, + double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); #ifdef __cplusplus } // extern "C" diff --git a/vpx_dsp/psnrhvs.c b/vpx_dsp/psnrhvs.c index 3708cc3c859e379e2fe97cfc54e648334510a6d7..b3910152c472e5264c711c51e981384f2fef94f6 100644 --- a/vpx_dsp/psnrhvs.c +++ b/vpx_dsp/psnrhvs.c @@ -22,28 +22,28 @@ #include "vpx_dsp/psnr.h" #if !defined(M_PI) -# define M_PI (3.141592653589793238462643) +#define M_PI (3.141592653589793238462643) #endif #include <string.h> static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) { int i, j; - (void) xstride; + (void)xstride; vpx_fdct8x8(x, y, ystride); for (i = 0; i < 8; i++) - for (j = 0; j< 8; j++) - *(y + ystride*i + j) = (*(y + ystride*i + j) + 4) >> 3; + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 
4) >> 3; } #if CONFIG_VP9_HIGHBITDEPTH static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, - int xstride) { + int xstride) { int i, j; - (void) xstride; + (void)xstride; vpx_highbd_fdct8x8(x, y, ystride); for (i = 0; i < 8; i++) - for (j = 0; j< 8; j++) - *(y + ystride*i + j) = (*(y + ystride*i + j) + 4) >> 3; + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } #endif @@ -51,56 +51,59 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, * transparency. This is not the JPEG based matrix from the paper, this one gives a slightly higher MOS agreement.*/ static const double csf_y[8][8] = { - {1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, - 0.678296995242, 0.466224900598, 0.3265091542}, - {2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, - 0.868920337363, 0.61280991668, 0.436405793551}, - {2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, - 0.670882927016, 0.501731932449, 0.372504254596}, - {1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, - 0.605636379554, 0.48309405692, 0.380429446972, 0.295774038565}, - {1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, - 0.448996256676, 0.352889268808, 0.283006984131, 0.226951348204}, - {0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, - 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321}, - {0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, - 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001}, - {0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, - 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276}}; + { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, + 0.678296995242, 0.466224900598, 0.3265091542 }, + { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, + 0.868920337363, 0.61280991668, 0.436405793551 }, + { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, + 0.670882927016, 0.501731932449, 0.372504254596 }, + { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, + 0.48309405692, 0.380429446972, 0.295774038565 }, + { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, + 0.352889268808, 0.283006984131, 0.226951348204 }, + { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, + 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, + { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, + 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 }, + { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, + 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } +}; static const double csf_cb420[8][8] = { - {1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, - 0.898018824055, 0.74725392039, 0.615105596242}, - {2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, - 1.17428548929, 0.996404342439, 0.830890433625}, - {1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, - 0.960060382087, 0.849823426169, 0.731221236837}, - {1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, - 0.801821139099, 0.751437590932, 0.685398513368, 0.608694761374}, - {1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, - 0.676555426187, 0.605503172737, 0.55002013668, 0.495804539034}, - {0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, - 0.605503172737, 
0.514674450957, 0.454353482512, 0.407050308965}, - {0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, - 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733}, - {0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, - 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237}}; + { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, + 0.898018824055, 0.74725392039, 0.615105596242 }, + { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, + 1.17428548929, 0.996404342439, 0.830890433625 }, + { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, + 0.960060382087, 0.849823426169, 0.731221236837 }, + { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, + 0.751437590932, 0.685398513368, 0.608694761374 }, + { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, + 0.605503172737, 0.55002013668, 0.495804539034 }, + { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, + 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, + { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, + 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, + { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, + 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } +}; static const double csf_cr420[8][8] = { - {2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, - 0.867069376285, 0.721500455585, 0.593906509971}, - {2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, - 1.13381474809, 0.962064122248, 0.802254508198}, - {1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, - 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706}, - {1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, - 0.725539939514, 0.661776842059, 0.587716619023}, - {1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, - 0.653238524286, 0.584635025748, 0.531064164893, 0.478717061273}, - {0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, - 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543}, - {0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, - 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063}, - {0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, - 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658}}; + { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, + 0.867069376285, 0.721500455585, 0.593906509971 }, + { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, + 1.13381474809, 0.962064122248, 0.802254508198 }, + { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, + 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, + { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, + 0.725539939514, 0.661776842059, 0.587716619023 }, + { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, + 0.584635025748, 0.531064164893, 0.478717061273 }, + { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, + 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 }, + { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, + 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, + { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, + 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } +}; static double 
convert_score_db(double _score, double _weight, int bit_depth) { int16_t pix_max = 255; @@ -110,16 +113,14 @@ static double convert_score_db(double _score, double _weight, int bit_depth) { else if (bit_depth == 12) pix_max = 4095; - if (_weight * _score < pix_max * pix_max * 1e-10) - return MAX_PSNR; + if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR; return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); } static double calc_psnrhvs(const unsigned char *src, int _systride, - const unsigned char *dst, int _dystride, - double _par, int _w, int _h, int _step, - const double _csf[8][8], uint32_t bit_depth, - uint32_t _shift) { + const unsigned char *dst, int _dystride, double _par, + int _w, int _h, int _step, const double _csf[8][8], + uint32_t bit_depth, uint32_t _shift) { double ret; const uint8_t *_src8 = src; const uint8_t *_dst8 = dst; @@ -131,7 +132,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, int pixels; int x; int y; - (void) _par; + (void)_par; ret = pixels = 0; /*In the PSNR-HVS-M paper[1] the authors describe the construction of @@ -152,8 +153,8 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/ for (x = 0; x < 8; x++) for (y = 0; y < 8; y++) - mask[x][y] = (_csf[x][y] * 0.3885746225901003) - * (_csf[x][y] * 0.3885746225901003); + mask[x][y] = + (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003); for (y = 0; y < _h - 7; y += _step) { for (x = 0; x < _w - 7; x += _step) { int i; @@ -188,27 +189,23 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, } s_gmean /= 64.f; d_gmean /= 64.f; - for (i = 0; i < 4; i++) - s_means[i] /= 16.f; - for (i = 0; i < 4; i++) - d_means[i] /= 16.f; + for (i = 0; i < 4; i++) s_means[i] /= 16.f; + for (i = 0; i < 4; i++) d_means[i] /= 16.f; for (i = 0; i < 8; i++) { for (j = 0; j < 8; j++) { int sub = ((i & 12) >> 2) + ((j & 12) >> 1); s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean); d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean); - s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) - * (dct_s[i * 8 + j] - s_means[sub]); - d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) - * (dct_d[i * 8 + j] - d_means[sub]); + s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) * + (dct_s[i * 8 + j] - s_means[sub]); + d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) * + (dct_d[i * 8 + j] - d_means[sub]); } } s_gvar *= 1 / 63.f * 64; d_gvar *= 1 / 63.f * 64; - for (i = 0; i < 4; i++) - s_vars[i] *= 1 / 15.f * 16; - for (i = 0; i < 4; i++) - d_vars[i] *= 1 / 15.f * 16; + for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16; + for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16; if (s_gvar > 0) s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar; if (d_gvar > 0) @@ -231,8 +228,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j]; s_mask = sqrt(s_mask * s_gvar) / 32.f; d_mask = sqrt(d_mask * d_gvar) / 32.f; - if (d_mask > s_mask) - s_mask = d_mask; + if (d_mask > s_mask) s_mask = d_mask; for (i = 0; i < 8; i++) { for (j = 0; j < 8; j++) { double err; @@ -245,16 +241,15 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, } } } - if (pixels <=0) - return 0; + if (pixels <= 0) return 0; ret /= pixels; return ret; } double vpx_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dest, double *y_psnrhvs, - double *u_psnrhvs, 
double *v_psnrhvs, - uint32_t bd, uint32_t in_bd) { + double *u_psnrhvs, double *v_psnrhvs, uint32_t bd, + uint32_t in_bd) { double psnrhvs; const double par = 1.0; const int step = 7; @@ -268,17 +263,13 @@ double vpx_psnrhvs(const YV12_BUFFER_CONFIG *src, *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dest->y_buffer, dest->y_stride, par, src->y_crop_width, - src->y_crop_height, step, csf_y, bd, - bd_shift); + src->y_crop_height, step, csf_y, bd, bd_shift); *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dest->u_buffer, dest->uv_stride, par, src->uv_crop_width, - src->uv_crop_height, step, csf_cb420, bd, - bd_shift); + src->uv_crop_height, step, csf_cb420, bd, bd_shift); *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dest->v_buffer, dest->uv_stride, par, src->uv_crop_width, - src->uv_crop_height, step, csf_cr420, bd, - bd_shift); + src->uv_crop_height, step, csf_cr420, bd, bd_shift); psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); return convert_score_db(psnrhvs, 1.0, in_bd); } - diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 80fcd66b052b07680b88a688ba4be98c4676a0f7..3c7f9832f7adc09fe6cd2fc3d6c273cdd5c67618 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -12,8 +12,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_mem/vpx_mem.h" -void vpx_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { @@ -29,20 +28,19 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, if (!skip_block) { tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; - if (tmp) - eob = 0; + if (tmp) eob = 0; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { +void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr) { int eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -56,8 +54,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, const int abs_qcoeff = (int)((tmp * quant) >> 16); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; - if (abs_qcoeff) - eob = 0; + if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; } @@ -81,19 +78,16 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; - if (tmp) - eob = 0; + if (tmp) eob = 0; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, - int skip_block, - const int16_t *round_ptr, - const int16_t quant, +void 
vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, @@ -112,24 +106,22 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int abs_qcoeff = (int)((tmp * quant) >> 15); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; - if (abs_qcoeff) - eob = 0; + if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; } #endif void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -158,12 +150,12 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, if (abs_coeff >= zbins[rc != 0]) { int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (tmp) - eob = i; + if (tmp) eob = i; } } } @@ -176,12 +168,11 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -214,8 +205,7 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) - eob = i; + if (abs_qcoeff) eob = i; } } } @@ -224,17 +214,15 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const 
int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1)}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int idx = 0; int idx_arr[1024]; @@ -267,33 +255,28 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> 15; + quant_shift_ptr[rc != 0]) >> + 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (tmp) - eob = idx_arr[i]; + if (tmp) eob = idx_arr[i]; } } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1)}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; +void vpx_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int idx = 0; int idx_arr[1024]; @@ -322,15 +305,14 @@ void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff - + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) - eob = idx_arr[i]; + if (abs_qcoeff) eob = idx_arr[i]; } } *eob_ptr = eob + 1; diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 89ec5979247b86cc9a9d577782fa039da1dda24a..e13284546333cd2b38a30bd9be0a5f2833ed3ec4 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -18,8 +18,7 @@ extern "C" { #endif -void vpx_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr); @@ -29,19 +28,17 @@ void 
vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t dequant_ptr, uint16_t *eob_ptr); #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, - int skip_block, +void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr); +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr); + const int16_t dequant_ptr, uint16_t *eob_ptr); #endif #ifdef __cplusplus diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index e49148d32e925a1e29e60e4deee14885377a7ad7..7306e8fb0ed67b489fb7b8efc6e02f1571f7f3a5 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -17,15 +17,13 @@ #include "vpx_ports/mem.h" /* Sum the difference between every corresponding element of the buffers. */ -static INLINE unsigned int sad(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int width, int height) { +static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -33,40 +31,43 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, return sad; } -#define sadMxN(m, n) \ -unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return sad(src, src_stride, ref, ref_stride, m, n); \ -} \ -unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - uint8_t comp_pred[m * n]; \ - vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return sad(src, src_stride, comp_pred, m, m, n); \ -} +#define sadMxN(m, n) \ + unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint8_t comp_pred[m * n]; \ + vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ + } // depending on call sites, pass **ref_array to avoid & in subsequent call and // de-dup with 4D below. 
-#define sadMxNxK(m, n, k) \ -void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) \ - sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ -} +#define sadMxNxK(m, n, k) \ + void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ + } // This appears to be equivalent to the above when k == 4 and refs is const -#define sadMxNx4D(m, n) \ -void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) \ - sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ -} +#define sadMxNx4D(m, n) \ + void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ + } +/* clang-format off */ #if CONFIG_VP10 && CONFIG_EXT_PARTITION // 128x128 sadMxN(128, 128) @@ -150,18 +151,18 @@ sadMxN(4, 4) sadMxNxK(4, 4, 3) sadMxNxK(4, 4, 8) sadMxNx4D(4, 4) +/* clang-format on */ #if CONFIG_VP9_HIGHBITDEPTH -static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int width, int height) { + static INLINE + unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -176,8 +177,7 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, unsigned int sad = 0; const uint16_t *a = CONVERT_TO_SHORTPTR(a8); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -185,43 +185,43 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, return sad; } -#define highbd_sadMxN(m, n) \ -unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ -} \ -unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *second_pred) { \ - uint16_t comp_pred[m * n]; \ - vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ -} +#define highbd_sadMxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint16_t comp_pred[m * n]; \ + vpx_highbd_comp_avg_pred_c(comp_pred, 
second_pred, m, n, ref, ref_stride); \ + return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ + } -#define highbd_sadMxNxK(m, n, k) \ -void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \ - ref_stride); \ - } \ -} +#define highbd_sadMxNxK(m, n, k) \ + void vpx_highbd_sad##m##x##n##x##k##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref_array, \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ + &ref_array[i], ref_stride); \ + } \ + } -#define highbd_sadMxNx4D(m, n) \ -void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \ - ref_stride); \ - } \ -} +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } +/* clang-format off */ #if CONFIG_VP10 && CONFIG_EXT_PARTITION // 128x128 highbd_sadMxN(128, 128) @@ -305,20 +305,19 @@ highbd_sadMxN(4, 4) highbd_sadMxNxK(4, 4, 3) highbd_sadMxNxK(4, 4, 8) highbd_sadMxNx4D(4, 4) - +/* clang-format on */ #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP10 && CONFIG_EXT_INTER -static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int width, int height) { + static INLINE + unsigned int masked_sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const uint8_t *m, int m_stride, + int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += m[x] * abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -329,13 +328,15 @@ static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride, return sad; } -#define MASKSADMxN(m, n) \ -unsigned int vpx_masked_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, n); \ -} +#define MASKSADMxN(m, n) \ + unsigned int vpx_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, \ + n); \ + } +/* clang-format off */ #if CONFIG_EXT_PARTITION MASKSADMxN(128, 128) MASKSADMxN(128, 64) @@ -354,20 +355,21 @@ MASKSADMxN(8, 8) MASKSADMxN(8, 4) MASKSADMxN(4, 8) MASKSADMxN(4, 4) +/* clang-format on */ #if CONFIG_VP9_HIGHBITDEPTH -static INLINE unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int width, int height) { + static INLINE + unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int width, + int height) { int 
y, x; unsigned int sad = 0; const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += m[x] * abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -378,16 +380,13 @@ static INLINE unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, return sad; } -#define HIGHBD_MASKSADMXN(m, n) \ -unsigned int vpx_highbd_masked_sad##m##x##n##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return highbd_masked_sad(src, src_stride, ref, ref_stride, \ - msk, msk_stride, m, n); \ -} +#define HIGHBD_MASKSADMXN(m, n) \ + unsigned int vpx_highbd_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad(src, src_stride, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } #if CONFIG_EXT_PARTITION HIGHBD_MASKSADMXN(128, 128) @@ -415,8 +414,7 @@ HIGHBD_MASKSADMXN(4, 4) // wsrc: target weighted prediction (has been *4096 to keep precision) // mask: 2d weights (scaled by 4096) static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, + const int32_t *wsrc, const int32_t *mask, int width, int height) { int y, x; unsigned int sad = 0; @@ -433,13 +431,14 @@ static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, return sad; } -#define OBMCSADMxN(m, n) \ -unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ - const int32_t *wsrc, \ - const int32_t *mask) { \ - return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ -} +#define OBMCSADMxN(m, n) \ + unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *mask) { \ + return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } +/* clang-format off */ #if CONFIG_EXT_PARTITION OBMCSADMxN(128, 128) OBMCSADMxN(128, 64) @@ -458,12 +457,13 @@ OBMCSADMxN(8, 8) OBMCSADMxN(8, 4) OBMCSADMxN(4, 8) OBMCSADMxN(4, 4) +/* clang-format on */ #if CONFIG_VP9_HIGHBITDEPTH -static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - int width, int height) { + static INLINE + unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); @@ -480,14 +480,14 @@ static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, return sad; } -#define HIGHBD_OBMCSADMXN(m, n) \ -unsigned int vpx_highbd_obmc_sad##m##x##n##_c(const uint8_t *ref, \ - int ref_stride, \ - const int32_t *wsrc, \ - const int32_t *mask) { \ - return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ -} +#define HIGHBD_OBMCSADMXN(m, n) \ + unsigned int vpx_highbd_obmc_sad##m##x##n##_c( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } +/* clang-format off */ #if CONFIG_EXT_PARTITION HIGHBD_OBMCSADMXN(128, 128) HIGHBD_OBMCSADMXN(128, 64) @@ -506,5 +506,6 @@ HIGHBD_OBMCSADMXN(8, 8) HIGHBD_OBMCSADMXN(8, 4) HIGHBD_OBMCSADMXN(4, 8) HIGHBD_OBMCSADMXN(4, 4) +/* clang-format on */ #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP10 && CONFIG_OBMC diff --git 
a/vpx_dsp/ssim.c b/vpx_dsp/ssim.c index 632e272dc947e6da885db3c2cf365cbe926b3943..7a29bd29f9f1c1509956baaa031e7d3775e22659 100644 --- a/vpx_dsp/ssim.c +++ b/vpx_dsp/ssim.c @@ -15,8 +15,8 @@ #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" -void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, - int rp, uint32_t *sum_s, uint32_t *sum_r, +void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; @@ -31,9 +31,8 @@ void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, } } void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, - uint32_t *sum_sq_s, uint32_t *sum_sq_r, - uint32_t *sum_sxr) { + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { @@ -47,9 +46,8 @@ void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, - const uint16_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, +void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; @@ -65,16 +63,15 @@ void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, } #endif // CONFIG_VP9_HIGHBITDEPTH -static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 -static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 -static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 -static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 -static double similarity(uint32_t sum_s, uint32_t sum_r, - uint32_t sum_sq_s, uint32_t sum_sq_r, - uint32_t sum_sxr, int count, +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { int64_t ssim_n, ssim_d; int64_t c1, c2; @@ -93,12 +90,12 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, assert(0); } - ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr - - (int64_t) 2 * sum_s * sum_r + c2); + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2); + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); return ssim_n * 1.0 / ssim_d; } @@ -116,12 +113,8 @@ static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); - return similarity(sum_s >> shift, - sum_r >> shift, - sum_sq_s >> (2 * shift), - sum_sq_r >> (2 * shift), - sum_sxr >> (2 * shift), - 64, bd); + return similarity(sum_s >> shift, sum_r >> 
shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -161,8 +154,8 @@ static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { for (j = 0; j <= width - 8; j += 4) { double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, - CONVERT_TO_SHORTPTR(img2 + j), stride_img2, - bd, shift); + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); ssim_total += v; samples++; } @@ -173,22 +166,18 @@ static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, #endif // CONFIG_VP9_HIGHBITDEPTH double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *weight) { + const YV12_BUFFER_CONFIG *dest, double *weight) { double a, b, c; double ssimv; - a = vpx_ssim2(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, - source->y_crop_width, source->y_crop_height); + a = vpx_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, source->y_crop_height); - b = vpx_ssim2(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, - source->uv_crop_width, source->uv_crop_height); + b = vpx_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); - c = vpx_ssim2(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, - source->uv_crop_width, source->uv_crop_height); + c = vpx_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); ssimv = a * .8 + .1 * (b + c); @@ -232,13 +221,13 @@ static double ssimv_similarity(const Ssimv *sv, int64_t n) { const int64_t c2 = (cc2 * n * n) >> 12; const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / - (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); // Since these variables are unsigned sums, convert to double so // math is done in double arithmetic. - const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) - / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + n * sv->sum_sq_r - - sv->sum_r * sv->sum_r + c2); + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); return l * v; } @@ -267,24 +256,21 @@ static double ssimv_similarity2(const Ssimv *sv, int64_t n) { // Since these variables are unsigned, sums convert to double so // math is done in double arithmetic. 
- const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) - / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + - n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); return l * v; } static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch, Ssimv *sv) { - vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, - &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r, - &sv->sum_sxr); + vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); } -double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, - uint8_t *img2, int img2_pitch, - int width, int height, - Ssimv *sv2, Metrics *m, - int do_inconsistency) { +double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { double dssim_total = 0; double ssim_total = 0; double ssim2_total = 0; @@ -295,10 +281,10 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, double old_ssim_total = 0; vpx_clear_system_state(); // We can sample points as frequently as we like start with 1 per 4x4. - for (i = 0; i < height; i += 4, - img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = {0}; + Ssimv sv = { 0 }; double ssim; double ssim2; double dssim; @@ -384,27 +370,29 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, // This measures how much consistent variance is in two consecutive // source frames. 1.0 means they have exactly the same variance. - const double variance_term = (2.0 * var_old * var_new + c1) / + const double variance_term = + (2.0 * var_old * var_new + c1) / (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); // This measures how consistent the local mean are between two // consecutive frames. 1.0 means they have exactly the same mean. - const double mean_term = (2.0 * mean_old * mean_new + c2) / + const double mean_term = + (2.0 * mean_old * mean_new + c2) / (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); // This measures how consistent the ssims of two // consecutive frames is. 1.0 means they are exactly the same. - double ssim_term = pow((2.0 * ssim_old * ssim_new + c3) / - (ssim_old * ssim_old + ssim_new * ssim_new + c3), - 5); + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); double this_inconsistency; // Floating point math sometimes makes this > 1 by a tiny bit. // We want the metric to scale between 0 and 1.0 so we can convert // it to an snr scaled value. - if (ssim_term > 1) - ssim_term = 1; + if (ssim_term > 1) ssim_term = 1; // This converts the consistency metric to an inconsistency metric // ( so we can scale it like psnr to something like sum square error. 
@@ -432,8 +420,7 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, ssim2_total *= norm; m->ssim2 = ssim2_total; m->ssim = ssim_total; - if (old_ssim_total == 0) - inconsistency_total = 0; + if (old_ssim_total == 0) inconsistency_total = 0; m->ssimc = inconsistency_total; @@ -441,11 +428,10 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, return inconsistency_total; } - #if CONFIG_VP9_HIGHBITDEPTH double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *weight, uint32_t bd, uint32_t in_bd) { + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd) { double a, b, c; double ssimv; uint32_t shift = 0; @@ -453,20 +439,17 @@ double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, assert(bd >= in_bd); shift = bd - in_bd; - a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, - source->y_crop_width, source->y_crop_height, - in_bd, shift); + a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, shift); - b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, - source->uv_crop_width, source->uv_crop_height, - in_bd, shift); + b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); - c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, - source->uv_crop_width, source->uv_crop_height, - in_bd, shift); + c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); ssimv = a * .8 + .1 * (b + c); diff --git a/vpx_dsp/ssim.h b/vpx_dsp/ssim.h index d4d6b0d8a93d3a9f51884f3d73e5823b71e9c12e..4f2bb1d556c8cd972fb5ed9f0304fff5983c1c18 100644 --- a/vpx_dsp/ssim.h +++ b/vpx_dsp/ssim.h @@ -63,22 +63,20 @@ typedef struct { } Metrics; double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, - int img2_pitch, int width, int height, Ssimv *sv2, - Metrics *m, int do_inconsistency); + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency); double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *weight); + const YV12_BUFFER_CONFIG *dest, double *weight); double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *ssim_y, double *ssim_u, - double *ssim_v, uint32_t bd, uint32_t in_bd); + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd); #if CONFIG_VP9_HIGHBITDEPTH double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *weight, + const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/subtract.c b/vpx_dsp/subtract.c index 556e0134f387bfdc49ae3895d4c253e0c774a6a8..95e7071b27e969fff125ae60b2e5b20fe944ccbe 100644 --- a/vpx_dsp/subtract.c +++ b/vpx_dsp/subtract.c @@ -16,32 +16,30 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -void vpx_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src, ptrdiff_t src_stride, - const uint8_t *pred, ptrdiff_t pred_stride) { +void vpx_subtract_block_c(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const 
uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) - diff[c] = src[c] - pred[c]; + for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, ptrdiff_t pred_stride, - int bd) { +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride, int bd) { int r, c; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - (void) bd; + (void)bd; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) { @@ -50,7 +48,7 @@ void vpx_highbd_subtract_block_c(int rows, int cols, diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/sum_squares.c b/vpx_dsp/sum_squares.c index c72461cd18aa2ed5555de7f2128ad662fb769c2b..73a90063415221830926d3dc7cc72bc927ce9755 100644 --- a/vpx_dsp/sum_squares.c +++ b/vpx_dsp/sum_squares.c @@ -20,9 +20,9 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, for (r = 0; r < size; r++) { for (c = 0; c < size; c++) { const int16_t v = src[c]; - ss += v*v; + ss += v * v; } - src += src_stride; + src += src_stride; } return ss; @@ -32,7 +32,7 @@ uint64_t vpx_sum_squares_i16_c(const int16_t *src, uint32_t n) { uint64_t ss = 0; do { const int16_t v = *src++; - ss += v*v; + ss += v * v; } while (--n); return ss; diff --git a/vpx_dsp/txfm_common.h b/vpx_dsp/txfm_common.h index 9b0e9900a88f9b062d3b9e0c3fcefe5471728ff8..3559b91e2c7dd1aef468fb1eb9b37ba1226e999e 100644 --- a/vpx_dsp/txfm_common.h +++ b/vpx_dsp/txfm_common.h @@ -15,7 +15,7 @@ // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 -#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) #define UNIT_QUANT_SHIFT 2 #define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) @@ -25,15 +25,15 @@ // printf("static const int cospi_%d_64 = %.0f;\n", i, // round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const tran_high_t cospi_1_64 = 16364; -static const tran_high_t cospi_2_64 = 16305; -static const tran_high_t cospi_3_64 = 16207; -static const tran_high_t cospi_4_64 = 16069; -static const tran_high_t cospi_5_64 = 15893; -static const tran_high_t cospi_6_64 = 15679; -static const tran_high_t cospi_7_64 = 15426; -static const tran_high_t cospi_8_64 = 15137; -static const tran_high_t cospi_9_64 = 14811; +static const tran_high_t cospi_1_64 = 16364; +static const tran_high_t cospi_2_64 = 16305; +static const tran_high_t cospi_3_64 = 16207; +static const tran_high_t cospi_4_64 = 16069; +static const tran_high_t cospi_5_64 = 15893; +static const tran_high_t cospi_6_64 = 15679; +static const tran_high_t cospi_7_64 = 15426; +static const tran_high_t cospi_8_64 = 15137; +static const tran_high_t cospi_9_64 = 14811; static const tran_high_t cospi_10_64 = 14449; static const tran_high_t cospi_11_64 = 14053; static const tran_high_t cospi_12_64 = 13623; diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index 3fd80dcf374a89e46b66b52f1a2afab92625e19b..79b6760c0f756a0d7a8de313cec3d87eb7ace69b 100644 --- 
a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -18,8 +18,8 @@ #include "vpx_dsp/variance.h" #include "vpx_dsp/vpx_filter.h" -uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride) { +uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride) { int distortion = 0; int r, c; @@ -49,28 +49,23 @@ uint32_t vpx_get_mb_ss_c(const int16_t *a) { uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse) { - return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0, - b, b_stride, sse); + return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse); } - uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse) { - return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4, - b, b_stride, sse); + return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse); } uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse) { - return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4, - b, b_stride, sse); + return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse); } -static void variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, int *sum) { int i, j; *sum = 0; @@ -106,9 +101,8 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] + - (int)a[pixel_step] * filter[1], - FILTER_BITS); + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); ++a; } @@ -133,13 +127,12 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { - unsigned int i, j; + unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] + - (int)a[pixel_step] * filter[1], - FILTER_BITS); + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); ++a; } @@ -148,82 +141,78 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, } } -#define VAR(W, H) \ -uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define SUBPIX_VAR(W, H) \ -uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ -\ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - 
return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ -} +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ + } -#define SUBPIX_AVG_VAR(W, H) \ -uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - int xoffset, int yoffset, \ - const uint8_t *b, \ - int b_stride, \ - uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ -\ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ -\ - return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ -} +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ + } /* Identical to the variance call except it takes an additional parameter, sum, * and returns that value using pass-by-reference instead of returning * sse - sum^2 / w*h */ -#define GET_VAR(W, H) \ -void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse, int *sum) { \ - variance(a, a_stride, b, b_stride, W, H, sse, sum); \ -} +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + int *sum) { \ + variance(a, a_stride, b, b_stride, W, H, sse, sum); \ + } /* Identical to the variance call except it does not calculate the * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in * variable. */ -#define MSE(W, H) \ -uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse; \ -} +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ + } /* All three forms of the variance are available in the same sizes. 
*/ #define VARIANCES(W, H) \ - VAR(W, H) \ - SUBPIX_VAR(W, H) \ - SUBPIX_AVG_VAR(W, H) + VAR(W, H) \ + SUBPIX_VAR(W, H) \ + SUBPIX_AVG_VAR(W, H) #if CONFIG_VP10 && CONFIG_EXT_PARTITION VARIANCES(128, 128) @@ -252,9 +241,8 @@ MSE(16, 8) MSE(8, 16) MSE(8, 8) -void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, - const uint8_t *ref, int ref_stride) { +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { int i, j; for (i = 0; i < height; ++i) { @@ -269,42 +257,41 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, } // Get pred block from up-sampled reference. -void vpx_upsampled_pred_c(uint8_t *comp_pred, - int width, int height, - const uint8_t *ref, int ref_stride) { - int i, j, k; - int stride = ref_stride << 3; - - for (i = 0; i < height; i++) { - for (j = 0, k = 0; j < width; j++, k += 8) { - comp_pred[j] = ref[k]; - } - comp_pred += width; - ref += stride; +void vpx_upsampled_pred_c(uint8_t *comp_pred, int width, int height, + const uint8_t *ref, int ref_stride) { + int i, j, k; + int stride = ref_stride << 3; + + for (i = 0; i < height; i++) { + for (j = 0, k = 0; j < width; j++, k += 8) { + comp_pred[j] = ref[k]; } + comp_pred += width; + ref += stride; + } } void vpx_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, - const uint8_t *ref, int ref_stride) { - int i, j; - int stride = ref_stride << 3; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - const int tmp = ref[(j << 3)] + pred[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += stride; + int width, int height, const uint8_t *ref, + int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = ref[(j << 3)] + pred[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); } + comp_pred += width; + pred += width; + ref += stride; + } } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint64_t *sse, int64_t *sum) { +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint64_t *sse, int64_t *sum) { int i, j; uint16_t *a = CONVERT_TO_SHORTPTR(a8); @@ -323,9 +310,9 @@ static void highbd_variance64(const uint8_t *a8, int a_stride, } } -static void highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); @@ -333,9 +320,9 @@ static void highbd_8_variance(const uint8_t *a8, int a_stride, *sum = (int)sum_long; } -static void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); @@ -343,9 +330,9 @@ static void highbd_10_variance(const uint8_t *a8, int a_stride, *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); } -static 
void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); @@ -353,107 +340,91 @@ static void highbd_12_variance(const uint8_t *a8, int a_stride, *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } -#define HIGHBD_VAR(W, H) \ -uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ -} \ -\ -uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ -} +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } -#define HIGHBD_GET_VAR(S) \ -void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } -#define HIGHBD_MSE(W, H) \ -uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } void vpx_highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, const uint8_t *filter) { unsigned int i, j; uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); for (i = 0; i < output_height; 
++i) { for (j = 0; j < output_width; ++j) { - output_ptr[j] = - ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] + - (int)src_ptr[pixel_step] * filter[1], - FILTER_BITS); + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); ++src_ptr; } @@ -465,21 +436,17 @@ void vpx_highbd_var_filter_block2d_bil_first_pass( } void vpx_highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, const uint8_t *filter) { - unsigned int i, j; + unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - output_ptr[j] = - ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] + - (int)src_ptr[pixel_step] * filter[1], - FILTER_BITS); + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); ++src_ptr; } @@ -488,136 +455,118 @@ void vpx_highbd_var_filter_block2d_bil_second_pass( } } -#define HIGHBD_SUBPIX_VAR(W, H) \ -uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ - dst_stride, sse); \ -} \ -\ -uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, sse); \ -} \ -\ -uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, sse); \ -} +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + 
vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ -uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ -\ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ -} \ -\ -uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ -\ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ - W, dst, dst_stride, sse); \ -} \ -\ -uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - 
vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ -\ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ - W, dst, dst_stride, sse); \ -} +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } /* All three forms of the variance are available in the same sizes. 
*/ #define HIGHBD_VARIANCES(W, H) \ - HIGHBD_VAR(W, H) \ - HIGHBD_SUBPIX_VAR(W, H) \ - HIGHBD_SUBPIX_AVG_VAR(W, H) + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) #if CONFIG_VP10 && CONFIG_EXT_PARTITION HIGHBD_VARIANCES(128, 128) @@ -663,10 +612,8 @@ void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, } } -void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred, - int width, int height, - const uint8_t *ref8, - int ref_stride) { +void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height, + const uint8_t *ref8, int ref_stride) { int i, j; int stride = ref_stride << 3; @@ -681,9 +628,8 @@ void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred, } void vpx_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, - const uint8_t *pred8, - int width, int height, - const uint8_t *ref8, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, int ref_stride) { int i, j; int stride = ref_stride << 3; @@ -703,10 +649,9 @@ void vpx_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP10 && CONFIG_EXT_INTER -void masked_variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, unsigned int *sse, int *sum) { +void masked_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const uint8_t *m, int m_stride, int w, int h, + unsigned int *sse, int *sum) { int i, j; int64_t sum64 = 0; @@ -723,39 +668,36 @@ void masked_variance(const uint8_t *a, int a_stride, b += b_stride; m += m_stride; } - sum64 = (sum64 >= 0) ? sum64 : -sum64; + sum64 = (sum64 >= 0) ? sum64 : -sum64; *sum = (int)ROUND_POWER_OF_TWO(sum64, 6); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12); } -#define MASK_VAR(W, H) \ -unsigned int vpx_masked_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - int sum; \ - masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define MASK_VAR(W, H) \ + unsigned int vpx_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define MASK_SUBPIX_VAR(W, H) \ -unsigned int vpx_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ -\ - var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, \ - msk, msk_stride, sse); \ -} +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int vpx_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ + 
bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return vpx_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, msk, \ + msk_stride, sse); \ + } MASK_VAR(4, 4) MASK_SUBPIX_VAR(4, 4) @@ -808,11 +750,10 @@ MASK_SUBPIX_VAR(128, 128) #endif // CONFIG_EXT_PARTITION #if CONFIG_VP9_HIGHBITDEPTH -void highbd_masked_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - uint64_t *sse, int64_t *sum) { +void highbd_masked_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, + int m_stride, int w, int h, uint64_t *sse, + int64_t *sum) { int i, j; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); @@ -831,150 +772,122 @@ void highbd_masked_variance64(const uint8_t *a8, int a_stride, b += b_stride; m += m_stride; } - *sum = (*sum >= 0) ? *sum : -*sum; + *sum = (*sum >= 0) ? *sum : -*sum; *sum = ROUND_POWER_OF_TWO(*sum, 6); *sse = ROUND_POWER_OF_TWO(*sse, 12); } -void highbd_masked_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse, int *sum) { +void highbd_masked_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, const uint8_t *m, int m_stride, int w, + int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; - highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, - w, h, &sse64, &sum64); + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); *sum = (int)sum64; *sse = (unsigned int)sse64; } -void highbd_10_masked_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, +void highbd_10_masked_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; - highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, - w, h, &sse64, &sum64); + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } -void highbd_12_masked_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, +void highbd_12_masked_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; - highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, - w, h, &sse64, &sum64); + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); } -#define HIGHBD_MASK_VAR(W, H) \ -unsigned int vpx_highbd_masked_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - const uint8_t *m, \ - int m_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, \ - W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vpx_highbd_10_masked_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - const uint8_t *m, \ 
- int m_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, \ - W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vpx_highbd_12_masked_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - const uint8_t *m, \ - int m_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, \ - W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define HIGHBD_MASK_VAR(W, H) \ + unsigned int vpx_highbd_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, \ + &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_10_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ + sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_12_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ + sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ -unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, \ - msk, msk_stride, sse); \ -} \ -\ -unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_10_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, \ - msk, msk_stride, sse); \ -} \ -\ -unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 
W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_12_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, \ - msk, msk_stride, sse); \ -} +#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ + unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } \ + \ + unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_10_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } \ + \ + unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_12_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } HIGHBD_MASK_VAR(4, 4) HIGHBD_MASK_SUBPIX_VAR(4, 4) @@ -1029,7 +942,7 @@ HIGHBD_MASK_SUBPIX_VAR(128, 128) #endif // CONFIG_VP10 && CONFIG_EXT_INTER #if CONFIG_VP10 && CONFIG_OBMC -static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, +static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int i, j; @@ -1050,35 +963,29 @@ static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, } } -#define OBMC_VAR(W, H) \ -unsigned int vpx_obmc_variance##W##x##H##_c(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define OBMC_VAR(W, H) \ + unsigned int vpx_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define OBMC_SUBPIX_VAR(W, H) \ -unsigned int vpx_obmc_sub_pixel_variance##W##x##H##_c(const uint8_t *pre, \ - int pre_stride, \ - int xoffset, 
\ - int yoffset, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return vpx_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ -} +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int vpx_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return vpx_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ + } OBMC_VAR(4, 4) OBMC_SUBPIX_VAR(4, 4) @@ -1133,8 +1040,7 @@ OBMC_SUBPIX_VAR(128, 128) #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, uint64_t *sse, int64_t *sum) { int i, j; uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); @@ -1157,8 +1063,7 @@ static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; @@ -1167,10 +1072,9 @@ static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)sse64; } -static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; @@ -1179,10 +1083,9 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } -static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; @@ -1191,94 +1094,76 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); } -#define HIGHBD_OBMC_VAR(W, H) \ -unsigned int vpx_highbd_obmc_variance##W##x##H##_c(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ - \ -unsigned int vpx_highbd_10_obmc_variance##W##x##H##_c(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ - \ -unsigned int vpx_highbd_12_obmc_variance##W##x##H##_c(const 
uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define HIGHBD_OBMC_VAR(W, H) \ + unsigned int vpx_highbd_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_10_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_12_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ -unsigned int vpx_highbd_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, \ - int xoffset, int yoffset, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return vpx_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, wsrc, mask, sse); \ -} \ - \ -unsigned int vpx_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, \ - int xoffset, int yoffset, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return vpx_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, wsrc, mask, sse); \ -} \ - \ -unsigned int vpx_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, \ - int xoffset, int yoffset, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return vpx_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, wsrc, mask, sse); \ -} +#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ + unsigned int vpx_highbd_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, 
bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + wsrc, mask, sse); \ + } \ + \ + unsigned int vpx_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } \ + \ + unsigned int vpx_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } HIGHBD_OBMC_VAR(4, 4) HIGHBD_OBMC_SUBPIX_VAR(4, 4) diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 837fc3dbcbc1aa25829ae951d03e2bdfe46a1313..18b62dbdedc8f2f0b9f962b6d915e773e5b97ecf 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -22,15 +22,15 @@ extern "C" { #define FILTER_BITS 7 #define FILTER_WEIGHT 128 -typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride); +typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); -typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *second_pred); +typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *second_pred); -typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, - uint8_t *b, int b_stride, int n); +typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, + int b_stride, int n); typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, @@ -38,8 +38,7 @@ typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride, typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, const uint8_t *const b_array[], - int b_stride, - unsigned int *sad_array); + int b_stride, unsigned int *sad_array); typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, @@ -50,95 +49,71 @@ typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse); -typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a, - int a_stride, - int xoffset, int yoffset, - const uint8_t *b, - int b_stride, - unsigned int *sse, - const uint8_t *second_pred); +typedef unsigned int (*vpx_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred); #if 
CONFIG_VP10 && CONFIG_EXT_INTER -typedef unsigned int(*vpx_masked_sad_fn_t)(const uint8_t *src, - int src_stride, - const uint8_t *ref, - int ref_stride, - const uint8_t *msk_ptr, - int msk_stride); -typedef unsigned int (*vpx_masked_variance_fn_t)(const uint8_t *src, - int src_stride, - const uint8_t *ref, - int ref_stride, - const uint8_t *msk, - int msk_stride, - unsigned int *sse); -typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src, - int src_stride, - int xoffset, int yoffset, - const uint8_t *ref, - int ref_stride, - const uint8_t *msk, - int msk_stride, - unsigned int *sse); +typedef unsigned int (*vpx_masked_sad_fn_t)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *msk_ptr, + int msk_stride); +typedef unsigned int (*vpx_masked_variance_fn_t)( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *msk, int msk_stride, unsigned int *sse); +typedef unsigned int (*vpx_masked_subpixvariance_fn_t)( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse); #endif // CONFIG_VP10 && CONFIG_EXT_INTER #if CONFIG_VP10 && CONFIG_OBMC -typedef unsigned int(*vpx_obmc_sad_fn_t)(const uint8_t *pred, - int pred_stride, - const int32_t *wsrc, - const int32_t *msk); +typedef unsigned int (*vpx_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, + const int32_t *wsrc, + const int32_t *msk); typedef unsigned int (*vpx_obmc_variance_fn_t)(const uint8_t *pred, int pred_stride, const int32_t *wsrc, const int32_t *msk, unsigned int *sse); -typedef unsigned int (*vpx_obmc_subpixvariance_fn_t)(const uint8_t *pred, - int pred_stride, - int xoffset, int yoffset, - const int32_t *wsrc, - const int32_t *msk, - unsigned int *sse); +typedef unsigned int (*vpx_obmc_subpixvariance_fn_t)( + const uint8_t *pred, int pred_stride, int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *msk, unsigned int *sse); #endif // CONFIG_VP10 && CONFIG_OBMC #if CONFIG_VP10 typedef struct vpx_variance_vtable { - vpx_sad_fn_t sdf; - vpx_sad_avg_fn_t sdaf; - vpx_variance_fn_t vf; - vpx_subpixvariance_fn_t svf; - vpx_subp_avg_variance_fn_t svaf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; - vpx_sad_multi_d_fn_t sdx4df; + vpx_sad_fn_t sdf; + vpx_sad_avg_fn_t sdaf; + vpx_variance_fn_t vf; + vpx_subpixvariance_fn_t svf; + vpx_subp_avg_variance_fn_t svaf; + vpx_sad_multi_fn_t sdx3f; + vpx_sad_multi_fn_t sdx8f; + vpx_sad_multi_d_fn_t sdx4df; #if CONFIG_EXT_INTER - vpx_masked_sad_fn_t msdf; - vpx_masked_variance_fn_t mvf; + vpx_masked_sad_fn_t msdf; + vpx_masked_variance_fn_t mvf; vpx_masked_subpixvariance_fn_t msvf; #endif // CONFIG_EXT_INTER #if CONFIG_OBMC - vpx_obmc_sad_fn_t osdf; - vpx_obmc_variance_fn_t ovf; - vpx_obmc_subpixvariance_fn_t osvf; + vpx_obmc_sad_fn_t osdf; + vpx_obmc_variance_fn_t ovf; + vpx_obmc_subpixvariance_fn_t osvf; #endif // CONFIG_OBMC } vpx_variance_fn_ptr_t; #endif // CONFIG_VP10 void vpx_highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, const uint8_t *filter); void vpx_highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, - uint16_t *output_ptr, - unsigned int 
src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, const uint8_t *filter); #ifdef __cplusplus diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c index 59d0488122fd15ac2ab6dd45b876659daace0f08..20d83640cab358ff693d54df668d5c9be1d9aa1b 100644 --- a/vpx_dsp/vpx_convolve.c +++ b/vpx_dsp/vpx_convolve.c @@ -21,8 +21,8 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -31,8 +31,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); x_q4 += x_step_q4; } @@ -43,8 +42,8 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -53,10 +52,9 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; - dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); x_q4 += x_step_q4; } src += src_stride; @@ -66,8 +64,8 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int x, y; src -= src_stride * (SUBPEL_TAPS / 2 - 1); @@ -89,8 +87,8 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int x, y; src -= src_stride * (SUBPEL_TAPS / 2 - 1); @@ -102,8 +100,10 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 
1); y_q4 += y_step_q4; } ++src; @@ -111,13 +111,11 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, +static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h) { + const InterpKernel *const y_filters, int y0_q4, + int y_step_q4, int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -132,7 +130,7 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); @@ -140,12 +138,11 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); - convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - temp, MAX_SB_SIZE, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, - dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, + MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height); + convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, + dst_stride, y_filters, y0_q4, y_step_q4, w, h); } static const InterpKernel *get_filter_base(const int16_t *filter) { @@ -161,67 +158,66 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) { void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); (void)filter_y; (void)y_step_q4; - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h); + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + w, h); } void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); (void)filter_y; (void)y_step_q4; - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h); + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h); } void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { const InterpKernel *const filters_y = 
get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); (void)filter_x; (void)x_step_q4; - convolve_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h); + convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, + w, h); } void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); (void)filter_x; (void)x_step_q4; - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h); + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h); } -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); @@ -229,36 +225,35 @@ void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); - convolve(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, + convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w, h); } -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { /* Fixed size intermediate buffer places limits on parameters. 
*/ DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); - vpx_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - vpx_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, - NULL, 0, NULL, 0, w, h); + vpx_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + vpx_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w, + h); } -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { int r; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; for (r = h; r > 0; --r) { memcpy(dst, src, w); @@ -267,47 +262,44 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { int x, y; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) - dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); src += src_stride; dst += dst_stride; } } -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, 
y_step_q4, w, h); @@ -316,8 +308,8 @@ void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } @@ -325,17 +317,16 @@ void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } @@ -343,9 +334,8 @@ void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, #if CONFIG_VP9_HIGHBITDEPTH static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, - int w, int h, int bd) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -356,8 +346,7 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); x_q4 += x_step_q4; } @@ -368,9 +357,8 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, - int w, int h, int bd) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -381,10 +369,10 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; - dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); x_q4 += x_step_q4; } src += src_stride; @@ -394,9 +382,8 @@ 
static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h, - int bd) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -409,8 +396,8 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = clip_pixel_highbd( - ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); y_q4 += y_step_q4; } ++src; @@ -420,9 +407,8 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h, - int bd) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -435,8 +421,10 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); y_q4 += y_step_q4; } ++src; @@ -446,11 +434,9 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h, int bd) { + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h, int bd) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -465,7 +451,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); @@ -473,28 +459,25 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, - x_filters, x0_q4, x_step_q4, w, - intermediate_height, bd); + CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height, bd); highbd_convolve_vert( - CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, - dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h, bd); + CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); } - void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); (void)filter_y; (void)y_step_q4; - highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h, bd); + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -507,22 +490,22 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, (void)filter_y; (void)y_step_q4; - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h, bd); + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); } void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); (void)filter_x; (void)x_step_q4; - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h, bd); + highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -535,42 +518,39 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, (void)filter_x; (void)x_step_q4; - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h, bd); + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); } void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); const InterpKernel *const filters_y = get_filter_base(filter_y); 
const int y0_q4 = get_filter_offset(filter_y, filters_y); - highbd_convolve(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, + highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); - vpx_highbd_convolve8_c(src, src_stride, - CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, + vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, - dst, dst_stride, - NULL, 0, NULL, 0, w, h, bd); + vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst, + dst_stride, NULL, 0, NULL, 0, w, h, bd); } void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, diff --git a/vpx_dsp/vpx_convolve.h b/vpx_dsp/vpx_convolve.h index bd8679d1095112bd63f54ebb9a395789fc618fc0..1da2c1f7cca33bc5df26ecae30dc1babc58d7818 100644 --- a/vpx_dsp/vpx_convolve.h +++ b/vpx_dsp/vpx_convolve.h @@ -30,16 +30,16 @@ extern "C" { // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. #if CONFIG_VP10 && CONFIG_EXT_PARTITION -# define MAX_EXT_SIZE 263 +#define MAX_EXT_SIZE 263 #else -# define MAX_EXT_SIZE 135 +#define MAX_EXT_SIZE 135 #endif // CONFIG_VP10 && CONFIG_EXT_PARTITION typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); + const int16_t *filter_y, int y_step_q4, int w, + int h); #if CONFIG_VP9_HIGHBITDEPTH typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 4648c34e57e0c674ccc6c03d166a6f04c8122a0b..5706cad542979c0bd252a97d8e35caa924d44d3c 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -20,30 +20,30 @@ extern "C" { #endif #ifndef MAX_SB_SIZE -# if CONFIG_VP10 && CONFIG_EXT_PARTITION -# define MAX_SB_SIZE 128 -# else -# define MAX_SB_SIZE 64 -# endif // CONFIG_VP10 && CONFIG_EXT_PARTITION +#if CONFIG_VP10 && CONFIG_EXT_PARTITION +#define MAX_SB_SIZE 128 +#else +#define MAX_SB_SIZE 64 +#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION #endif // ndef MAX_SB_SIZE #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y)) #define VPXMAX(x, y) (((x) > (y)) ? (x) : (y)) -#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') +#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') -#define IS_POWER_OF_TWO(x) (((x) & ((x) - 1)) == 0) +#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) // These can be used to give a hint about branch outcomes. // This can have an effect, even if your target processor has a // good branch predictor, as these hints can affect basic block // ordering by the compiler. 
#ifdef __GNUC__ -# define LIKELY(v) __builtin_expect(v, 1) -# define UNLIKELY(v) __builtin_expect(v, 0) +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) #else -# define LIKELY(v) (v) -# define UNLIKELY(v) (v) +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) #endif #define VPX_SWAP(type, a, b) \ @@ -83,12 +83,9 @@ static INLINE double fclamp(double value, double low, double high) { static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: - default: - return (uint16_t)clamp(val, 0, 255); - case 10: - return (uint16_t)clamp(val, 0, 1023); - case 12: - return (uint16_t)clamp(val, 0, 4095); + default: return (uint16_t)clamp(val, 0, 255); + case 10: return (uint16_t)clamp(val, 0, 1023); + case 12: return (uint16_t)clamp(val, 0, 4095); } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd.c b/vpx_dsp/vpx_dsp_rtcd.c index 5fe27b614bdcd7be4df2ea700f16190d1258ab27..030c456d391777b0ed3b365f7a13f81648a09a5d 100644 --- a/vpx_dsp/vpx_dsp_rtcd.c +++ b/vpx_dsp/vpx_dsp_rtcd.c @@ -12,6 +12,4 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/vpx_once.h" -void vpx_dsp_rtcd() { - once(setup_rtcd_internal); -} +void vpx_dsp_rtcd() { once(setup_rtcd_internal); } diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h index cfe8161961ee718b1932a7ec8b6709a5a601985f..157daea988211aa5643100690bd71bd58526382c 100644 --- a/vpx_dsp/vpx_filter.h +++ b/vpx_dsp/vpx_filter.h @@ -13,7 +13,6 @@ #include "vpx/vpx_integer.h" - #ifdef __cplusplus extern "C" { #endif @@ -27,19 +26,13 @@ extern "C" { typedef int16_t InterpKernel[SUBPEL_TAPS]; -#define BIL_SUBPEL_BITS 3 -#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS) +#define BIL_SUBPEL_BITS 3 +#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS) // 2 tap bilinear filters static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = { - { 128, 0 }, - { 112, 16 }, - { 96, 32 }, - { 80, 48 }, - { 64, 64 }, - { 48, 80 }, - { 32, 96 }, - { 16, 112 }, + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; #ifdef __cplusplus diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index ecc215a8e79cc98291970473a645b34283900d0f..9eafc6ce9ed6c19274c3e8fbca76511c120881b5 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -18,7 +18,7 @@ void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); // Row 0 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); @@ -96,7 +96,7 @@ void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) { __m128i s0, s1, u0; unsigned int avg = 0; - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); s0 = _mm_adds_epu16(s0, s1); @@ -124,7 +124,7 @@ unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) { __m128i s0, s1, u0; unsigned int avg = 0; - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); s0 = _mm_unpacklo_epi8(xx_loadl_32(s), u0); s1 = _mm_unpacklo_epi8(xx_loadl_32(s + p), u0); s0 = _mm_adds_epu16(s0, s1); @@ -251,8 +251,8 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, int16_t *coeff) { int 
idx; for (idx = 0; idx < 4; ++idx) { - int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride - + (idx & 0x01) * 8; + int16_t const *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); } @@ -312,7 +312,7 @@ int vpx_satd_sse2(const int16_t *coeff, int length) { return _mm_cvtsi128_si32(accum); } -void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, +void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); @@ -381,8 +381,7 @@ int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) { return _mm_extract_epi16(s0, 0); } -int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, - const int bwl) { +int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) { int idx; int width = 4 << bwl; int16_t mean; @@ -401,23 +400,23 @@ int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, diff = _mm_subs_epi16(v0, v1); sum = _mm_add_epi16(sum, diff); - v0 = _mm_madd_epi16(diff, diff); + v0 = _mm_madd_epi16(diff, diff); sse = _mm_add_epi32(sse, v0); ref += 8; src += 8; } - v0 = _mm_srli_si128(sum, 8); + v0 = _mm_srli_si128(sum, 8); sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi64(sum, 32); + v0 = _mm_srli_epi64(sum, 32); sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi32(sum, 16); + v0 = _mm_srli_epi32(sum, 16); sum = _mm_add_epi16(sum, v0); - v1 = _mm_srli_si128(sse, 8); + v1 = _mm_srli_si128(sse, 8); sse = _mm_add_epi32(sse, v1); - v1 = _mm_srli_epi64(sse, 32); + v1 = _mm_srli_epi64(sse, 32); sse = _mm_add_epi32(sse, v1); mean = _mm_extract_epi16(sum, 0); diff --git a/vpx_dsp/x86/blend_a64_hmask_sse4.c b/vpx_dsp/x86/blend_a64_hmask_sse4.c index a10e0771bc92acc2e48bc218ff7e73d6f44f7c57..892323463cdc49313bcad37ca3f433b867ca3742 100644 --- a/vpx_dsp/x86/blend_a64_hmask_sse4.c +++ b/vpx_dsp/x86/blend_a64_hmask_sse4.c @@ -15,27 +15,21 @@ // To start out, just dispatch to the function using the 2D mask and // pass mask stride as 0. This can be improved upon if necessary. 
-void vpx_blend_a64_hmask_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { - vpx_blend_a64_mask_sse4_1(dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, 0, h, w, 0, 0); +void vpx_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + vpx_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, 0, h, w, 0, 0); } #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_blend_a64_hmask_sse4_1( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, - int bd) { - vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, - src0_8, src0_stride, - src1_8, src1_stride, - mask, 0, h, w, 0, 0, bd); + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, + src1_8, src1_stride, mask, 0, h, w, 0, 0, + bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/blend_a64_mask_sse4.c b/vpx_dsp/x86/blend_a64_mask_sse4.c index 6aa89fa2ed1d9e5c9398b1dc5785cf4dc6abf555..a5c6de5de4b3eaf479902dfa89976d458544a60a 100644 --- a/vpx_dsp/x86/blend_a64_mask_sse4.c +++ b/vpx_dsp/x86/blend_a64_mask_sse4.c @@ -26,12 +26,11 @@ // No sub-sampling ////////////////////////////////////////////////////////////////////////////// -static void blend_a64_mask_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { +static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -54,12 +53,11 @@ static void blend_a64_mask_w4_sse4_1( } while (--h); } -static void blend_a64_mask_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { +static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -83,11 +81,9 @@ static void blend_a64_mask_w8_sse4_1( } static void blend_a64_mask_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -100,10 +96,9 @@ static void blend_a64_mask_w16n_sse4_1( const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); const __m128i v_m1h_w = 
_mm_sub_epi16(v_maxval_w, v_m0h_w); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0l_w, v_m1l_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0h_w, v_m1h_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -121,13 +116,11 @@ static void blend_a64_mask_w16n_sse4_1( ////////////////////////////////////////////////////////////////////////////// static void blend_a64_mask_sx_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -153,13 +146,11 @@ static void blend_a64_mask_sx_w4_sse4_1( } static void blend_a64_mask_sx_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -185,13 +176,11 @@ static void blend_a64_mask_sx_w8_sse4_1( } static void blend_a64_mask_sx_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -207,10 +196,9 @@ static void blend_a64_mask_sx_w16n_sse4_1( const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0l_w, v_m1l_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0h_w, v_m1h_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -228,11 +216,9 @@ static void blend_a64_mask_sx_w16n_sse4_1( 
////////////////////////////////////////////////////////////////////////////// static void blend_a64_mask_sy_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -259,11 +245,9 @@ static void blend_a64_mask_sy_w4_sse4_1( } static void blend_a64_mask_sy_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -290,11 +274,9 @@ static void blend_a64_mask_sy_w8_sse4_1( } static void blend_a64_mask_sy_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_zero = _mm_setzero_si128(); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); @@ -310,10 +292,9 @@ static void blend_a64_mask_sy_w16n_sse4_1( const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0l_w, v_m1l_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0h_w, v_m1h_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -331,13 +312,11 @@ static void blend_a64_mask_sy_w16n_sse4_1( ////////////////////////////////////////////////////////////////////////////// static void blend_a64_mask_sx_sy_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -347,8 +326,8 @@ static void blend_a64_mask_sx_sy_w4_sse4_1( const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), - v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), 
v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); @@ -368,13 +347,11 @@ static void blend_a64_mask_sx_sy_w4_sse4_1( } static void blend_a64_mask_sx_sy_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -384,8 +361,8 @@ static void blend_a64_mask_sx_sy_w8_sse4_1( const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), - v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); @@ -405,13 +382,11 @@ static void blend_a64_mask_sx_sy_w8_sse4_1( } static void blend_a64_mask_sx_sy_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -425,10 +400,10 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); - const __m128i v_rvsbl_w = _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), - v_zmask_b); - const __m128i v_rvsbh_w = _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), - v_zmask_b); + const __m128i v_rvsbl_w = + _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); + const __m128i v_rvsbh_w = + _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); @@ -437,10 +412,9 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0l_w, v_m1l_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0h_w, v_m1h_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -457,30 +431,27 @@ static void 
blend_a64_mask_sx_sy_w16n_sse4_1( // Dispatch ////////////////////////////////////////////////////////////////////////////// -void vpx_blend_a64_mask_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int suby, int subx) { - typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w); +void vpx_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + int w, int suby, int subx) { + typedef void (*blend_fn)( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w); // Dimensions are: width_index X subx X suby static const blend_fn blend[3][2][2] = { - { // w % 16 == 0 - {blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1}, - {blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1} - }, { // w == 4 - {blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1}, - {blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1} - }, { // w == 8 - {blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1}, - {blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1} - } + { // w % 16 == 0 + { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, + { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, + { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, + { // w == 8 + { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, + { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } }; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -492,17 +463,12 @@ void vpx_blend_a64_mask_sse4_1( assert(IS_POWER_OF_TWO(w)); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - vpx_blend_a64_mask_c(dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, mask_stride, - h, w, suby, subx); + vpx_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, h, w, suby, subx); } else { - blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, mask_stride, - h, w); + blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, + src0_stride, src1, src1_stride, + mask, mask_stride, h, w); } } @@ -512,11 +478,9 @@ void vpx_blend_a64_mask_sse4_1( ////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_mask_bn_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, blend_unit_fn blend) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -536,35 +500,28 @@ static INLINE void blend_a64_mask_bn_w4_sse4_1( } static void blend_a64_mask_b10_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, 
uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b10); + src1_stride, mask, mask_stride, h, blend_4_b10); } static void blend_a64_mask_b12_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b12); + src1_stride, mask, mask_stride, h, blend_4_b12); } static INLINE void blend_a64_mask_bn_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, blend_unit_fn blend) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -586,22 +543,18 @@ static INLINE void blend_a64_mask_bn_w8n_sse4_1( } static void blend_a64_mask_b10_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b10); } static void blend_a64_mask_b12_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b12); @@ -612,13 +565,11 @@ static void blend_a64_mask_b12_w8n_sse4_1( ////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { 
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -640,11 +591,9 @@ static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( } static void blend_a64_mask_b10_sx_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -652,25 +601,22 @@ static void blend_a64_mask_b10_sx_w4_sse4_1( } static void blend_a64_mask_b12_sx_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; - blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b12); } static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -694,22 +640,18 @@ static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( } static void blend_a64_mask_b10_sx_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b10); } static void blend_a64_mask_b12_sx_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b12); @@ -720,11 +662,9 @@ static void blend_a64_mask_b12_sx_w8n_sse4_1( 
////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, blend_unit_fn blend) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -747,11 +687,9 @@ static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( } static void blend_a64_mask_b10_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -759,11 +697,9 @@ static void blend_a64_mask_b10_sy_w4_sse4_1( } static void blend_a64_mask_b12_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -771,11 +707,10 @@ static void blend_a64_mask_b12_sy_w4_sse4_1( } static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, blend_unit_fn blend) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -800,22 +735,18 @@ static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( } static void blend_a64_mask_b10_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b10); } static void blend_a64_mask_b12_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { 
blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b12); @@ -826,13 +757,11 @@ static void blend_a64_mask_b12_sy_w8n_sse4_1( ////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -840,8 +769,8 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), - v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); @@ -859,11 +788,9 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( } static void blend_a64_mask_b10_sx_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -871,11 +798,9 @@ static void blend_a64_mask_b10_sx_sy_w4_sse4_1( } static void blend_a64_mask_b12_sx_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -883,13 +808,12 @@ static void blend_a64_mask_b12_sx_sy_w4_sse4_1( } static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 
0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -899,8 +823,8 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), - v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); @@ -918,22 +842,18 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( } static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b10); } static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b12); @@ -943,38 +863,38 @@ static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( // Dispatch ////////////////////////////////////////////////////////////////////////////// -void vpx_highbd_blend_a64_mask_sse4_1( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int suby, int subx, int bd) { - typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w); +void vpx_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int h, int w, + int suby, int subx, int bd) { + typedef void (*blend_fn)( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w); // Dimensions are: bd_index X width_index X subx X suby static const blend_fn blend[2][2][2][2] = { { // bd == 8 or 10 - { // w % 8 == 0 - {blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1}, - {blend_a64_mask_b10_sx_w8n_sse4_1, blend_a64_mask_b10_sx_sy_w8n_sse4_1} - }, { // w == 4 - {blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1}, - {blend_a64_mask_b10_sx_w4_sse4_1, blend_a64_mask_b10_sx_sy_w4_sse4_1} - } - }, + { // w % 8 == 0 + { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, + { blend_a64_mask_b10_sx_w8n_sse4_1, + 
blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, + { blend_a64_mask_b10_sx_w4_sse4_1, + blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, { // bd == 12 - { // w % 8 == 0 - {blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1}, - {blend_a64_mask_b12_sx_w8n_sse4_1, blend_a64_mask_b12_sx_sy_w8n_sse4_1} - }, { // w == 4 - {blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1}, - {blend_a64_mask_b12_sx_w4_sse4_1, blend_a64_mask_b12_sx_sy_w4_sse4_1} - } - } + { // w % 8 == 0 + { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, + { blend_a64_mask_b12_sx_w8n_sse4_1, + blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, + { blend_a64_mask_b12_sx_w4_sse4_1, + blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } }; assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); @@ -987,21 +907,17 @@ void vpx_highbd_blend_a64_mask_sse4_1( assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - vpx_highbd_blend_a64_mask_c(dst_8, dst_stride, - src0_8, src0_stride, - src1_8, src1_stride, - mask, mask_stride, - h, w, suby, subx, bd); + vpx_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, mask_stride, h, w, suby, + subx, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, mask_stride, - h, w); + blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w); } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/blend_a64_vmask_sse4.c b/vpx_dsp/x86/blend_a64_vmask_sse4.c index 85842e111935fec99f61da70d2b11d5a730f89f0..e7fe1bba150fe5bafdc95cdc30f4f27351416667 100644 --- a/vpx_dsp/x86/blend_a64_vmask_sse4.c +++ b/vpx_dsp/x86/blend_a64_vmask_sse4.c @@ -26,11 +26,10 @@ // Implementation - No sub-sampling ////////////////////////////////////////////////////////////////////////////// -static void blend_a64_vmask_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -52,11 +51,10 @@ static void blend_a64_vmask_w4_sse4_1( } while (--h); } -static void blend_a64_vmask_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -78,11 +76,12 @@ static void blend_a64_vmask_w8_sse4_1( } while (--h); } -static void blend_a64_vmask_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t 
src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, + uint32_t src0_stride, + const uint8_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -90,10 +89,9 @@ static void blend_a64_vmask_w16n_sse4_1( const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); for (c = 0; c < w; c += 16) { - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0_w, v_m1_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0_w, v_m1_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -110,15 +108,14 @@ static void blend_a64_vmask_w16n_sse4_1( // Dispatch ////////////////////////////////////////////////////////////////////////////// -void vpx_blend_a64_vmask_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { - typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w); +void vpx_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); // Dimension: width_index static const blend_fn blend[9] = { @@ -141,10 +138,8 @@ void vpx_blend_a64_vmask_sse4_1( assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); - blend[w & 0xf](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, h, w); + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, + w); } #if CONFIG_VP9_HIGHBITDEPTH @@ -153,9 +148,8 @@ void vpx_blend_a64_vmask_sse4_1( ////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_vmask_bn_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); @@ -174,32 +168,31 @@ static INLINE void blend_a64_vmask_bn_w4_sse4_1( } while (--h); } -static void blend_a64_vmask_b10_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, - blend_4_b10); + src1_stride, mask, h, blend_4_b10); } -static void blend_a64_vmask_b12_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, 
uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, - blend_4_b12); + src1_stride, mask, h, blend_4_b12); } static INLINE void blend_a64_vmask_bn_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); @@ -219,24 +212,24 @@ static INLINE void blend_a64_vmask_bn_w8n_sse4_1( } while (--h); } -static void blend_a64_vmask_b10_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w, - blend_8_b10); + src1_stride, mask, h, w, blend_8_b10); } -static void blend_a64_vmask_b12_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w, - blend_8_b12); + src1_stride, mask, h, w, blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// @@ -244,23 +237,25 @@ static void blend_a64_vmask_b12_w8n_sse4_1( ////////////////////////////////////////////////////////////////////////////// void vpx_highbd_blend_a64_vmask_sse4_1( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd) { - typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w); + typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); // Dimensions are: bd_index X width_index static const blend_fn blend[2][2] = { - { // bd == 8 or 10 - blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 - blend_a64_vmask_b10_w4_sse4_1, // w == 4 - }, { // bd == 12 - blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 - blend_a64_vmask_b12_w4_sse4_1, // w == 4 + { + // bd == 8 or 10 + blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b10_w4_sse4_1, // w == 4 + 
}, + { + // bd == 12 + blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b12_w4_sse4_1, // w == 4 } }; @@ -275,19 +270,15 @@ void vpx_highbd_blend_a64_vmask_sse4_1( assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride, - src0_8, src0_stride, - src1_8, src1_stride, - mask, h, w, bd); + vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, h, w, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - blend[bd == 12][(w >> 2) & 1](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, h, w); + blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w); } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/blend_sse4.h b/vpx_dsp/x86/blend_sse4.h index 9b74f905442d05f58447493dd39bdcb837914d44..e3b031931541f49bea2bec4336c7f5676c4172d0 100644 --- a/vpx_dsp/x86/blend_sse4.h +++ b/vpx_dsp/x86/blend_sse4.h @@ -99,8 +99,8 @@ static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); // Scale - const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, - VPX_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssum_d = + _mm_srli_epi32(v_sum_d, VPX_BLEND_A64_ROUND_BITS - 1); // Pack const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); @@ -127,10 +127,10 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); // Scale - const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, - VPX_BLEND_A64_ROUND_BITS - 1); - const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, - VPX_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssuml_d = + _mm_srli_epi32(v_suml_d, VPX_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssumh_d = + _mm_srli_epi32(v_sumh_d, VPX_BLEND_A64_ROUND_BITS - 1); // Pack const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h index ab387d664d3828fe5fcde52ecb183330959e2310..ae1089e38a605c2f64183e04bd5987b3644cba45 100644 --- a/vpx_dsp/x86/convolve.h +++ b/vpx_dsp/x86/convolve.h @@ -17,272 +17,186 @@ #include "vpx_ports/mem.h" #include "vpx_dsp/vpx_convolve.h" -typedef void filter8_1dfunction ( - const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter -); +typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, const int16_t *filter); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter[3] != 128); \ - assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - 
vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } else { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - vpx_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } \ -} +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vpx_convolve8_##name##_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + assert(filter[3] != 128); \ + assert(step_q4 == 16); \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + } \ + } else { \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + } \ + } \ + } -#define FUN_CONV_2D(avg, opt) \ -void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] || filter_x[1] || filter_x[2]|| \ - filter_y[0] || filter_y[1] || filter_y[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+7)]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - fdata2, MAX_SB_SIZE, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ - dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+1)]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ -} +#define FUN_CONV_2D(avg, opt) \ + void vpx_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + assert(filter_x[3] != 128); \ + assert(filter_y[3] != 128); \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if 
(filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] || \ + filter_y[1] || filter_y[2]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, \ + MAX_SB_SIZE, filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h + 7); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ + dst, dst_stride, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h + 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } #if CONFIG_VP9_HIGHBITDEPTH -typedef void highbd_filter8_1dfunction ( - const uint16_t *src_ptr, - const ptrdiff_t src_pitch, - uint16_t *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const int16_t *filter, - int bd -); +typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, int bd); #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ - ptrdiff_t src_stride, \ - uint8_t *dst8, \ - ptrdiff_t dst_stride, \ - const int16_t *filter_x, \ - int x_step_q4, \ - const int16_t *filter_y, \ - int y_step_q4, \ - int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ -} + void vpx_highbd_convolve8_##name##_##opt( \ + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (filter[0] | 
filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h, bd); \ + } \ + } -#define HIGH_FUN_CONV_2D(avg, opt) \ -void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h, int bd) { \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+7)]); \ - vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, \ - src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), \ - MAX_SB_SIZE, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, \ - MAX_SB_SIZE, \ - dst, \ - dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+1)]); \ - vpx_highbd_convolve8_horiz_##opt(src, \ - src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), \ - MAX_SB_SIZE, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), \ - MAX_SB_SIZE, \ - dst, \ - dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h, bd); \ - } \ -} +#define HIGH_FUN_CONV_2D(avg, opt) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, \ + fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + 
CONVERT_TO_BYTEPTR(fdata2), \ + MAX_SB_SIZE, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst, \ + dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, \ + fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ + vpx_highbd_convolve8_horiz_##opt( \ + src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ + } #endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_DSP_X86_CONVOLVE_H_ diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 951af3a622dbeb24ed90bdada0e22e9b259c4217..39d3a3f59c1c4c0759b39abd1b008a96a808757d 100644 --- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -13,15 +13,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" -#define pair256_set_epi16(a, b) \ +#define pair256_set_epi16(a, b) \ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) -#define pair256_set_epi32(a, b) \ - _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \ - (int)(b), (int)(a), (int)(b), (int)(a)) +#define pair256_set_epi32(a, b) \ + _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \ + (int)(b), (int)(a)) #if FDCT32x32_HIGH_PRECISION static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { @@ -40,8 +40,7 @@ static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) { } #endif -void FDCT32x32_2D_AVX2(const int16_t *input, - int16_t *output_org, int stride) { +void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { // Calculate pre-multiplied strides const int str1 = stride; const int str2 = 2 * stride; @@ -53,43 +52,45 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
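/*
 * Illustrative sketch (editor's addition, not part of the patch): how one of
 * the paired constants built with pair256_set_epi16 is consumed.  Interleaving
 * two 16-bit rows with unpacklo/unpackhi puts an (a, b) pair in every 32-bit
 * lane, so a single _mm256_madd_epi16 against a pair256_set_epi16(c0, c1)
 * constant computes a*c0 + b*c1 per lane -- half of a butterfly rotation --
 * followed by the dct_const_round_shift pattern seen throughout this file.
 * The helper name butterfly_half_avx2 is hypothetical.
 */
static INLINE __m256i butterfly_half_avx2(__m256i row_a, __m256i row_b,
                                          __m256i k_pair) {
  const __m256i lo = _mm256_unpacklo_epi16(row_a, row_b);
  const __m256i hi = _mm256_unpackhi_epi16(row_a, row_b);
  const __m256i prod_lo = _mm256_madd_epi16(lo, k_pair);  // a*c0 + b*c1, 32-bit
  const __m256i prod_hi = _mm256_madd_epi16(hi, k_pair);
  const __m256i k_round = _mm256_set1_epi32(DCT_CONST_ROUNDING);
  // dct_const_round_shift: add the rounding bias, shift back, repack to 16 bit.
  const __m256i r_lo =
      _mm256_srai_epi32(_mm256_add_epi32(prod_lo, k_round), DCT_CONST_BITS);
  const __m256i r_hi =
      _mm256_srai_epi32(_mm256_add_epi32(prod_hi, k_round), DCT_CONST_BITS);
  return _mm256_packs_epi32(r_lo, r_hi);
}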
const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64); - const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p16_m16 = + pair256_set_epi16(+cospi_16_64, -cospi_16_64); + const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); - const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); - const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); + const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); + const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64); - const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64); - const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64); - const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64); - const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); - const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); - const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); - const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); - const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64); - const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64); - const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64); - const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64); - const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); - const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); - const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); - const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); - const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64); - const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64); - const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64); - const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64); - const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); - const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); - const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); - const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); + const __m256i k__cospi_m12_m20 = + pair256_set_epi16(-cospi_12_64, -cospi_20_64); + const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64); + const __m256i 
k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64); + const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64); + const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); + const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64); + const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64); + const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64); + const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64); + const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); + const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); + const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); + const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); + const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64); + const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64); + const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64); + const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64); + const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); + const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); + const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); + const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); const __m256i kZero = _mm256_set1_epi16(0); - const __m256i kOne = _mm256_set1_epi16(1); + const __m256i kOne = _mm256_set1_epi16(1); // Do the two transform/transpose passes int pass; for (pass = 0; pass < 2; ++pass) { @@ -104,125 +105,149 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // Note: even though all the loads below are aligned, using the aligned // intrinsic make the code slightly slower. if (0 == pass) { - const int16_t *in = &input[column_start]; + const int16_t *in = &input[column_start]; // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. 
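/*
 * Editor's sketch of the rolled-up form the note above refers to (hypothetical;
 * the shipped code keeps the four blocks unrolled because the loop measured
 * slower despite the smaller instruction-cache footprint).  Each iteration
 * mirrors row i against row 31 - i: step1[i] = (a + b) << 2 and
 * step1[31 - i] = (a - b) << 2, matching the unrolled blocks below.
 */
{
  int i, j;
  for (i = 0; i < 16; i += 4) {
    const int16_t *ina = in + i * str1;
    const int16_t *inb = in + (31 - i) * str1;
    __m256i *step1a = &step1[i];
    __m256i *step1b = &step1[31 - i];
    for (j = 0; j < 4; ++j) {
      const __m256i a = _mm256_loadu_si256((const __m256i *)(ina + j * str1));
      const __m256i b = _mm256_loadu_si256((const __m256i *)(inb - j * str1));
      step1a[j] = _mm256_slli_epi16(_mm256_add_epi16(a, b), 2);
      step1b[-j] = _mm256_slli_epi16(_mm256_sub_epi16(a, b), 2);
    }
  }
}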
{ - const int16_t *ina = in + 0 * str1; - const int16_t *inb = in + 31 * str1; - __m256i *step1a = &step1[ 0]; + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m256i *step1a = &step1[0]; __m256i *step1b = &step1[31]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[ 0] = _mm256_add_epi16(ina0, inb0); - step1a[ 1] = _mm256_add_epi16(ina1, inb1); - step1a[ 2] = _mm256_add_epi16(ina2, inb2); - step1a[ 3] = _mm256_add_epi16(ina3, inb3); + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); step1b[-3] = _mm256_sub_epi16(ina3, inb3); step1b[-2] = _mm256_sub_epi16(ina2, inb2); step1b[-1] = _mm256_sub_epi16(ina1, inb1); step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 4 * str1; - const int16_t *inb = in + 27 * str1; - __m256i *step1a = &step1[ 4]; + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m256i *step1a = &step1[4]; __m256i *step1b = &step1[27]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[ 0] = _mm256_add_epi16(ina0, inb0); - step1a[ 1] = _mm256_add_epi16(ina1, inb1); - step1a[ 2] = _mm256_add_epi16(ina2, inb2); - step1a[ 3] = _mm256_add_epi16(ina3, inb3); + const __m256i ina0 = 
_mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); step1b[-3] = _mm256_sub_epi16(ina3, inb3); step1b[-2] = _mm256_sub_epi16(ina2, inb2); step1b[-1] = _mm256_sub_epi16(ina1, inb1); step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 8 * str1; - const int16_t *inb = in + 23 * str1; - __m256i *step1a = &step1[ 8]; + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m256i *step1a = &step1[8]; __m256i *step1b = &step1[23]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[ 0] = _mm256_add_epi16(ina0, inb0); - step1a[ 1] = _mm256_add_epi16(ina1, inb1); - step1a[ 2] = _mm256_add_epi16(ina2, inb2); - step1a[ 3] = _mm256_add_epi16(ina3, inb3); + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); step1b[-3] = _mm256_sub_epi16(ina3, inb3); step1b[-2] = _mm256_sub_epi16(ina2, inb2); step1b[-1] = _mm256_sub_epi16(ina1, inb1); step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); - step1a[ 2] = 
_mm256_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 12 * str1; - const int16_t *inb = in + 19 * str1; + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; __m256i *step1a = &step1[12]; __m256i *step1b = &step1[19]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[ 0] = _mm256_add_epi16(ina0, inb0); - step1a[ 1] = _mm256_add_epi16(ina1, inb1); - step1a[ 2] = _mm256_add_epi16(ina2, inb2); - step1a[ 3] = _mm256_add_epi16(ina3, inb3); + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); step1b[-3] = _mm256_sub_epi16(ina3, inb3); step1b[-2] = _mm256_sub_epi16(ina2, inb2); step1b[-1] = _mm256_sub_epi16(ina1, inb1); step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); @@ -237,52 +262,52 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. 
{ - __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); - __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); - __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); - __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); - __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); - __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); - __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); - __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); - step1[ 0] = _mm256_add_epi16(in00, in31); - step1[ 1] = _mm256_add_epi16(in01, in30); - step1[ 2] = _mm256_add_epi16(in02, in29); - step1[ 3] = _mm256_add_epi16(in03, in28); + __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); + __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); + __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); + __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); + __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); + __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); + __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); + __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); + step1[0] = _mm256_add_epi16(in00, in31); + step1[1] = _mm256_add_epi16(in01, in30); + step1[2] = _mm256_add_epi16(in02, in29); + step1[3] = _mm256_add_epi16(in03, in28); step1[28] = _mm256_sub_epi16(in03, in28); step1[29] = _mm256_sub_epi16(in02, in29); step1[30] = _mm256_sub_epi16(in01, in30); step1[31] = _mm256_sub_epi16(in00, in31); } { - __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); - __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); - __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); - __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); - __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); - __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); - __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); - __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); - step1[ 4] = _mm256_add_epi16(in04, in27); - step1[ 5] = _mm256_add_epi16(in05, in26); - step1[ 6] = _mm256_add_epi16(in06, in25); - step1[ 7] = _mm256_add_epi16(in07, in24); + __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); + __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); + __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); + __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); + __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); + __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); + __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); + __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); + step1[4] = _mm256_add_epi16(in04, in27); + step1[5] = _mm256_add_epi16(in05, in26); + step1[6] = _mm256_add_epi16(in06, in25); + step1[7] = _mm256_add_epi16(in07, in24); step1[24] = _mm256_sub_epi16(in07, in24); step1[25] = _mm256_sub_epi16(in06, in25); step1[26] = _mm256_sub_epi16(in05, in26); step1[27] = _mm256_sub_epi16(in04, in27); } { - __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); - __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); - __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); - __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); - __m256i 
in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); - __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); - __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); - __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); - step1[ 8] = _mm256_add_epi16(in08, in23); - step1[ 9] = _mm256_add_epi16(in09, in22); + __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); + __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); + __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); + __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); + __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); + __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); + __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); + __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); + step1[8] = _mm256_add_epi16(in08, in23); + step1[9] = _mm256_add_epi16(in09, in22); step1[10] = _mm256_add_epi16(in10, in21); step1[11] = _mm256_add_epi16(in11, in20); step1[20] = _mm256_sub_epi16(in11, in20); @@ -291,14 +316,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input, step1[23] = _mm256_sub_epi16(in08, in23); } { - __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); - __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); - __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); - __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); - __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); - __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); - __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); - __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); + __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); + __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); + __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); + __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); + __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); + __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); + __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); + __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); step1[12] = _mm256_add_epi16(in12, in19); step1[13] = _mm256_add_epi16(in13, in18); step1[14] = _mm256_add_epi16(in14, in17); @@ -311,16 +336,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // Stage 2 { - step2[ 0] = _mm256_add_epi16(step1[0], step1[15]); - step2[ 1] = _mm256_add_epi16(step1[1], step1[14]); - step2[ 2] = _mm256_add_epi16(step1[2], step1[13]); - step2[ 3] = _mm256_add_epi16(step1[3], step1[12]); - step2[ 4] = _mm256_add_epi16(step1[4], step1[11]); - step2[ 5] = _mm256_add_epi16(step1[5], step1[10]); - step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]); - step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]); - step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]); - step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]); + step2[0] = _mm256_add_epi16(step1[0], step1[15]); + step2[1] = _mm256_add_epi16(step1[1], step1[14]); + step2[2] = _mm256_add_epi16(step1[2], step1[13]); + step2[3] = _mm256_add_epi16(step1[3], step1[12]); + step2[4] = _mm256_add_epi16(step1[4], step1[11]); + step2[5] = _mm256_add_epi16(step1[5], step1[10]); + step2[6] = _mm256_add_epi16(step1[6], step1[9]); + step2[7] = _mm256_add_epi16(step1[7], step1[8]); + step2[8] = _mm256_sub_epi16(step1[7], 
step1[8]); + step2[9] = _mm256_sub_epi16(step1[6], step1[9]); step2[10] = _mm256_sub_epi16(step1[5], step1[10]); step2[11] = _mm256_sub_epi16(step1[4], step1[11]); step2[12] = _mm256_sub_epi16(step1[3], step1[12]); @@ -354,22 +379,38 @@ void FDCT32x32_2D_AVX2(const int16_t *input, const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16); const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16); // dct_const_round_shift - const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m256i s2_20_4 = + _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m256i s2_20_5 = + _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m256i s2_21_4 = + _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m256i s2_21_5 = + _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m256i s2_22_4 = + _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m256i s2_22_5 = + _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m256i s2_23_4 = + _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m256i s2_23_5 = + _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m256i s2_24_4 = + _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m256i s2_24_5 = + _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m256i s2_25_4 = + _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m256i s2_25_5 = + _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m256i s2_26_4 = + _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m256i s2_26_5 = + _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m256i s2_27_4 = + _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m256i s2_27_5 = + _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS); const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS); const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS); @@ -401,49 +442,49 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // dump the magnitude by half, hence the intermediate values are within // the range of 16 bits. 
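/*
 * Editor's note (scalar sketch, hypothetical helper name): the second-pass
 * block below is the SIMD form of x = (x + (x < 0) + 1) >> 2 per 16-bit lane.
 * _mm256_cmpgt_epi16(kZero, x) is -1 for negative lanes, so subtracting it
 * adds 1 only when x < 0; kOne adds the usual rounding bias, and the
 * arithmetic shift then divides by 4, rounding to nearest with ties toward
 * zero (e.g. 6 -> 1, -6 -> -1).
 */
static INLINE int16_t half_round_shift(int16_t x) {
  const int16_t sign_bias = (int16_t)(x < 0);   // plays the role of -cmpgt(0, x)
  return (int16_t)((x + sign_bias + 1) >> 2);   // assumes arithmetic >>, as srai
}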
if (1 == pass) { - __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero,step2[ 0]); - __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero,step2[ 1]); - __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero,step2[ 2]); - __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero,step2[ 3]); - __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero,step2[ 4]); - __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero,step2[ 5]); - __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero,step2[ 6]); - __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero,step2[ 7]); - __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero,step2[ 8]); - __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero,step2[ 9]); - __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero,step2[10]); - __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero,step2[11]); - __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero,step2[12]); - __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero,step2[13]); - __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero,step2[14]); - __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero,step2[15]); - __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero,step1[16]); - __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero,step1[17]); - __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero,step1[18]); - __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero,step1[19]); - __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero,step2[20]); - __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero,step2[21]); - __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero,step2[22]); - __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero,step2[23]); - __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero,step2[24]); - __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero,step2[25]); - __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero,step2[26]); - __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero,step2[27]); - __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero,step1[28]); - __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero,step1[29]); - __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero,step1[30]); - __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero,step1[31]); - - step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0); - step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0); - step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0); - step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0); - step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0); - step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0); - step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0); - step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0); - step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0); - step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0); + __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]); + __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]); + __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]); + __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]); + __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]); + __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]); + __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]); + __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]); + __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]); + __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]); + __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]); + __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]); + __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]); + __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]); + __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]); + __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]); + __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]); + __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]); + __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]); + __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]); + __m256i s3_20_0 = 
_mm256_cmpgt_epi16(kZero, step2[20]); + __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]); + __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]); + __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]); + __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]); + __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]); + __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]); + __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]); + __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]); + __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]); + __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]); + __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]); + + step2[0] = _mm256_sub_epi16(step2[0], s3_00_0); + step2[1] = _mm256_sub_epi16(step2[1], s3_01_0); + step2[2] = _mm256_sub_epi16(step2[2], s3_02_0); + step2[3] = _mm256_sub_epi16(step2[3], s3_03_0); + step2[4] = _mm256_sub_epi16(step2[4], s3_04_0); + step2[5] = _mm256_sub_epi16(step2[5], s3_05_0); + step2[6] = _mm256_sub_epi16(step2[6], s3_06_0); + step2[7] = _mm256_sub_epi16(step2[7], s3_07_0); + step2[8] = _mm256_sub_epi16(step2[8], s2_08_0); + step2[9] = _mm256_sub_epi16(step2[9], s2_09_0); step2[10] = _mm256_sub_epi16(step2[10], s3_10_0); step2[11] = _mm256_sub_epi16(step2[11], s3_11_0); step2[12] = _mm256_sub_epi16(step2[12], s3_12_0); @@ -467,16 +508,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, step1[30] = _mm256_sub_epi16(step1[30], s3_30_0); step1[31] = _mm256_sub_epi16(step1[31], s3_31_0); - step2[ 0] = _mm256_add_epi16(step2[ 0], kOne); - step2[ 1] = _mm256_add_epi16(step2[ 1], kOne); - step2[ 2] = _mm256_add_epi16(step2[ 2], kOne); - step2[ 3] = _mm256_add_epi16(step2[ 3], kOne); - step2[ 4] = _mm256_add_epi16(step2[ 4], kOne); - step2[ 5] = _mm256_add_epi16(step2[ 5], kOne); - step2[ 6] = _mm256_add_epi16(step2[ 6], kOne); - step2[ 7] = _mm256_add_epi16(step2[ 7], kOne); - step2[ 8] = _mm256_add_epi16(step2[ 8], kOne); - step2[ 9] = _mm256_add_epi16(step2[ 9], kOne); + step2[0] = _mm256_add_epi16(step2[0], kOne); + step2[1] = _mm256_add_epi16(step2[1], kOne); + step2[2] = _mm256_add_epi16(step2[2], kOne); + step2[3] = _mm256_add_epi16(step2[3], kOne); + step2[4] = _mm256_add_epi16(step2[4], kOne); + step2[5] = _mm256_add_epi16(step2[5], kOne); + step2[6] = _mm256_add_epi16(step2[6], kOne); + step2[7] = _mm256_add_epi16(step2[7], kOne); + step2[8] = _mm256_add_epi16(step2[8], kOne); + step2[9] = _mm256_add_epi16(step2[9], kOne); step2[10] = _mm256_add_epi16(step2[10], kOne); step2[11] = _mm256_add_epi16(step2[11], kOne); step2[12] = _mm256_add_epi16(step2[12], kOne); @@ -500,16 +541,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, step1[30] = _mm256_add_epi16(step1[30], kOne); step1[31] = _mm256_add_epi16(step1[31], kOne); - step2[ 0] = _mm256_srai_epi16(step2[ 0], 2); - step2[ 1] = _mm256_srai_epi16(step2[ 1], 2); - step2[ 2] = _mm256_srai_epi16(step2[ 2], 2); - step2[ 3] = _mm256_srai_epi16(step2[ 3], 2); - step2[ 4] = _mm256_srai_epi16(step2[ 4], 2); - step2[ 5] = _mm256_srai_epi16(step2[ 5], 2); - step2[ 6] = _mm256_srai_epi16(step2[ 6], 2); - step2[ 7] = _mm256_srai_epi16(step2[ 7], 2); - step2[ 8] = _mm256_srai_epi16(step2[ 8], 2); - step2[ 9] = _mm256_srai_epi16(step2[ 9], 2); + step2[0] = _mm256_srai_epi16(step2[0], 2); + step2[1] = _mm256_srai_epi16(step2[1], 2); + step2[2] = _mm256_srai_epi16(step2[2], 2); + step2[3] = _mm256_srai_epi16(step2[3], 2); + step2[4] = _mm256_srai_epi16(step2[4], 2); + step2[5] = _mm256_srai_epi16(step2[5], 2); + step2[6] = _mm256_srai_epi16(step2[6], 2); + step2[7] = 
_mm256_srai_epi16(step2[7], 2); + step2[8] = _mm256_srai_epi16(step2[8], 2); + step2[9] = _mm256_srai_epi16(step2[9], 2); step2[10] = _mm256_srai_epi16(step2[10], 2); step2[11] = _mm256_srai_epi16(step2[11], 2); step2[12] = _mm256_srai_epi16(step2[12], 2); @@ -538,616 +579,796 @@ void FDCT32x32_2D_AVX2(const int16_t *input, #if FDCT32x32_HIGH_PRECISION if (pass == 0) { #endif - // Stage 3 - { - step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); - step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); - step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); - step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); - step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); - step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); - step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); - step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]); - } - { - const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); - const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); - const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); - const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); - const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); - } - { - step3[16] = _mm256_add_epi16(step2[23], step1[16]); - step3[17] = _mm256_add_epi16(step2[22], step1[17]); - step3[18] = _mm256_add_epi16(step2[21], step1[18]); - step3[19] = _mm256_add_epi16(step2[20], step1[19]); - step3[20] = _mm256_sub_epi16(step1[19], step2[20]); - step3[21] = _mm256_sub_epi16(step1[18], step2[21]); - step3[22] = _mm256_sub_epi16(step1[17], step2[22]); - step3[23] = 
_mm256_sub_epi16(step1[16], step2[23]); - step3[24] = _mm256_sub_epi16(step1[31], step2[24]); - step3[25] = _mm256_sub_epi16(step1[30], step2[25]); - step3[26] = _mm256_sub_epi16(step1[29], step2[26]); - step3[27] = _mm256_sub_epi16(step1[28], step2[27]); - step3[28] = _mm256_add_epi16(step2[27], step1[28]); - step3[29] = _mm256_add_epi16(step2[26], step1[29]); - step3[30] = _mm256_add_epi16(step2[25], step1[30]); - step3[31] = _mm256_add_epi16(step2[24], step1[31]); - } + // Stage 3 + { + step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm256_add_epi16(step2[23], step1[16]); + step3[17] = _mm256_add_epi16(step2[22], step1[17]); + step3[18] = _mm256_add_epi16(step2[21], step1[18]); + step3[19] = _mm256_add_epi16(step2[20], step1[19]); + step3[20] = _mm256_sub_epi16(step1[19], step2[20]); + step3[21] = 
_mm256_sub_epi16(step1[18], step2[21]); + step3[22] = _mm256_sub_epi16(step1[17], step2[22]); + step3[23] = _mm256_sub_epi16(step1[16], step2[23]); + step3[24] = _mm256_sub_epi16(step1[31], step2[24]); + step3[25] = _mm256_sub_epi16(step1[30], step2[25]); + step3[26] = _mm256_sub_epi16(step1[29], step2[26]); + step3[27] = _mm256_sub_epi16(step1[28], step2[27]); + step3[28] = _mm256_add_epi16(step2[27], step1[28]); + step3[29] = _mm256_add_epi16(step2[26], step1[29]); + step3[30] = _mm256_add_epi16(step2[25], step1[30]); + step3[31] = _mm256_add_epi16(step2[24], step1[31]); + } - // Stage 4 - { - step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]); - step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]); - step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]); - step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]); - step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]); - step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]); - step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]); - step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]); - step1[12] = _mm256_sub_epi16(step2[15], step3[12]); - step1[13] = _mm256_sub_epi16(step2[14], step3[13]); - step1[14] = _mm256_add_epi16(step3[13], step2[14]); - step1[15] = _mm256_add_epi16(step3[12], step2[15]); - } - { - const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); - const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); - const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); - } - { - const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); - const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); - const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); - const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); - const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); - const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); - const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); - const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); - const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, 
k__cospi_m24_m08); - const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); - } - // Stage 5 - { - step2[4] = _mm256_add_epi16(step1[5], step3[4]); - step2[5] = _mm256_sub_epi16(step3[4], step1[5]); - step2[6] = _mm256_sub_epi16(step3[7], 
step1[6]); - step2[7] = _mm256_add_epi16(step1[6], step3[7]); - } - { - const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); - const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); - const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); - const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); - const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7); - out[16] = _mm256_packs_epi32(out_16_6, out_16_7); - out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7); - out[24] = _mm256_packs_epi32(out_24_6, out_24_7); - } - { - const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]); - const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]); - const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); - const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); - const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m256i s2_10_5 = 
_mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[ 9] = _mm256_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); - } - { - step2[16] = _mm256_add_epi16(step1[19], step3[16]); - step2[17] = _mm256_add_epi16(step1[18], step3[17]); - step2[18] = _mm256_sub_epi16(step3[17], step1[18]); - step2[19] = _mm256_sub_epi16(step3[16], step1[19]); - step2[20] = _mm256_sub_epi16(step3[23], step1[20]); - step2[21] = _mm256_sub_epi16(step3[22], step1[21]); - step2[22] = _mm256_add_epi16(step1[21], step3[22]); - step2[23] = _mm256_add_epi16(step1[20], step3[23]); - step2[24] = _mm256_add_epi16(step1[27], step3[24]); - step2[25] = _mm256_add_epi16(step1[26], step3[25]); - step2[26] = _mm256_sub_epi16(step3[25], step1[26]); - step2[27] = _mm256_sub_epi16(step3[24], step1[27]); - step2[28] = _mm256_sub_epi16(step3[31], step1[28]); - step2[29] = _mm256_sub_epi16(step3[30], step1[29]); - step2[30] = _mm256_add_epi16(step1[29], step3[30]); - step2[31] = _mm256_add_epi16(step1[28], step3[31]); - } - // Stage 6 - { - const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); - const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); - const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); - const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); - const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); - const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); - const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); - const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); - const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m256i out_12_4 = 
_mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7); - out[20] = _mm256_packs_epi32(out_20_6, out_20_7); - out[12] = _mm256_packs_epi32(out_12_6, out_12_7); - out[28] = _mm256_packs_epi32(out_28_6, out_28_7); - } - { - step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]); - step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]); - step3[10] = _mm256_sub_epi16(step1[11], step2[10]); - step3[11] = _mm256_add_epi16(step2[10], step1[11]); - step3[12] = _mm256_add_epi16(step2[13], step1[12]); - step3[13] = _mm256_sub_epi16(step1[12], step2[13]); - step3[14] = _mm256_sub_epi16(step1[15], step2[14]); - step3[15] = _mm256_add_epi16(step2[14], step1[15]); - } - { - const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); - const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); - const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); - const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); - const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); - const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]); - const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); - const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); - const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m256i s3_18_5 = 
_mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m256i s3_25_4 = _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); - } - // Stage 7 - { - const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]); - const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]); - const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]); - const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]); - const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); - const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); - const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); - const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); - const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, 
k__cospi_p22_p10); - const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7); - out[18] = _mm256_packs_epi32(out_18_6, out_18_7); - out[10] = _mm256_packs_epi32(out_10_6, out_10_7); - out[26] = _mm256_packs_epi32(out_26_6, out_26_7); - out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7); - out[22] = _mm256_packs_epi32(out_22_6, out_22_7); - out[14] = _mm256_packs_epi32(out_14_6, 
out_14_7); - out[30] = _mm256_packs_epi32(out_30_6, out_30_7); - } - { - step1[16] = _mm256_add_epi16(step3[17], step2[16]); - step1[17] = _mm256_sub_epi16(step2[16], step3[17]); - step1[18] = _mm256_sub_epi16(step2[19], step3[18]); - step1[19] = _mm256_add_epi16(step3[18], step2[19]); - step1[20] = _mm256_add_epi16(step3[21], step2[20]); - step1[21] = _mm256_sub_epi16(step2[20], step3[21]); - step1[22] = _mm256_sub_epi16(step2[23], step3[22]); - step1[23] = _mm256_add_epi16(step3[22], step2[23]); - step1[24] = _mm256_add_epi16(step3[25], step2[24]); - step1[25] = _mm256_sub_epi16(step2[24], step3[25]); - step1[26] = _mm256_sub_epi16(step2[27], step3[26]); - step1[27] = _mm256_add_epi16(step3[26], step2[27]); - step1[28] = _mm256_add_epi16(step3[29], step2[28]); - step1[29] = _mm256_sub_epi16(step2[28], step3[29]); - step1[30] = _mm256_sub_epi16(step2[31], step3[30]); - step1[31] = _mm256_add_epi16(step3[30], step2[31]); - } - // Final stage --- outputs indices are bit-reversed. - { - const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); - const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); - const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); - const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); - const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); - const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); - const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); - const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); - const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m256i out_07_5 = _mm256_add_epi32(out_07_3, 
k__DCT_CONST_ROUNDING); - const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7); - out[17] = _mm256_packs_epi32(out_17_6, out_17_7); - out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7); - out[25] = _mm256_packs_epi32(out_25_6, out_25_7); - out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7); - out[23] = _mm256_packs_epi32(out_23_6, out_23_7); - out[15] = _mm256_packs_epi32(out_15_6, out_15_7); - out[31] = _mm256_packs_epi32(out_31_6, out_31_7); - } - { - const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); - const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); - const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); - const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); - const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); - const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); - const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); - const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); - const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); - const 
__m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7); - out[21] = _mm256_packs_epi32(out_21_6, out_21_7); - out[13] = _mm256_packs_epi32(out_13_6, out_13_7); - out[29] = _mm256_packs_epi32(out_29_6, out_29_7); - out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7); - out[19] = _mm256_packs_epi32(out_19_6, out_19_7); - out[11] = _mm256_packs_epi32(out_11_6, out_11_7); - out[27] = _mm256_packs_epi32(out_27_6, out_27_7); - } + // Stage 4 + { + step1[0] = _mm256_add_epi16(step3[3], step3[0]); + step1[1] = _mm256_add_epi16(step3[2], step3[1]); + step1[2] = _mm256_sub_epi16(step3[1], step3[2]); + step1[3] = _mm256_sub_epi16(step3[0], step3[3]); + step1[8] = _mm256_add_epi16(step3[11], step2[8]); + step1[9] = _mm256_add_epi16(step3[10], step2[9]); + step1[10] = _mm256_sub_epi16(step2[9], step3[10]); + step1[11] = 
_mm256_sub_epi16(step2[8], step3[11]); + step1[12] = _mm256_sub_epi16(step2[15], step3[12]); + step1[13] = _mm256_sub_epi16(step2[14], step3[13]); + step1[14] = _mm256_add_epi16(step3[13], step2[14]); + step1[15] = _mm256_add_epi16(step3[12], step2[15]); + } + { + const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); + const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); + const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s1_05_4 = + _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m256i s1_05_5 = + _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m256i s1_06_4 = + _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m256i s1_06_5 = + _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); + const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); + const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); + const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); + const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); + const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); + const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); + const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); + const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s1_18_4 = + _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m256i s1_18_5 = + _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m256i s1_19_4 = + _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m256i s1_19_5 = + _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m256i s1_20_4 = + _mm256_add_epi32(s1_20_2, 
k__DCT_CONST_ROUNDING); + const __m256i s1_20_5 = + _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m256i s1_21_4 = + _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m256i s1_21_5 = + _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m256i s1_26_4 = + _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m256i s1_26_5 = + _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m256i s1_27_4 = + _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m256i s1_27_5 = + _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m256i s1_28_4 = + _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m256i s1_28_5 = + _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m256i s1_29_4 = + _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m256i s1_29_5 = + _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm256_add_epi16(step1[5], step3[4]); + step2[5] = _mm256_sub_epi16(step3[4], step1[5]); + step2[6] = _mm256_sub_epi16(step3[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[6], step3[7]); + } + { + const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); + const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); + const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); + const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); + const __m256i out_00_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m256i out_00_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m256i out_16_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m256i out_16_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m256i out_08_2 = + _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m256i out_08_3 = + _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m256i out_24_2 = + _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m256i out_24_3 = + _mm256_madd_epi16(out_08_1, k__cospi_m08_p24); + 
// dct_const_round_shift + const __m256i out_00_4 = + _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m256i out_00_5 = + _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m256i out_16_4 = + _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m256i out_16_5 = + _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m256i out_08_4 = + _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m256i out_08_5 = + _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m256i out_24_4 = + _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m256i out_24_5 = + _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm256_packs_epi32(out_00_6, out_00_7); + out[16] = _mm256_packs_epi32(out_16_6, out_16_7); + out[8] = _mm256_packs_epi32(out_08_6, out_08_7); + out[24] = _mm256_packs_epi32(out_24_6, out_24_7); + } + { + const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]); + const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]); + const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); + const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); + const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s2_09_4 = + _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m256i s2_09_5 = + _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m256i s2_10_4 = + _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m256i s2_10_5 = + _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m256i s2_13_4 = + _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m256i s2_13_5 = + _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m256i s2_14_4 = + _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m256i s2_14_5 = + _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + 
step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm256_add_epi16(step1[19], step3[16]); + step2[17] = _mm256_add_epi16(step1[18], step3[17]); + step2[18] = _mm256_sub_epi16(step3[17], step1[18]); + step2[19] = _mm256_sub_epi16(step3[16], step1[19]); + step2[20] = _mm256_sub_epi16(step3[23], step1[20]); + step2[21] = _mm256_sub_epi16(step3[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[21], step3[22]); + step2[23] = _mm256_add_epi16(step1[20], step3[23]); + step2[24] = _mm256_add_epi16(step1[27], step3[24]); + step2[25] = _mm256_add_epi16(step1[26], step3[25]); + step2[26] = _mm256_sub_epi16(step3[25], step1[26]); + step2[27] = _mm256_sub_epi16(step3[24], step1[27]); + step2[28] = _mm256_sub_epi16(step3[31], step1[28]); + step2[29] = _mm256_sub_epi16(step3[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step3[30]); + step2[31] = _mm256_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_04_2 = + _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m256i out_04_3 = + _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m256i out_20_2 = + _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m256i out_20_3 = + _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m256i out_12_2 = + _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m256i out_12_3 = + _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m256i out_28_2 = + _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m256i out_28_3 = + _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m256i out_04_4 = + _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m256i out_04_5 = + _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m256i out_20_4 = + _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m256i out_20_5 = + _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m256i out_12_4 = + _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m256i out_12_5 = + _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m256i out_28_4 = + _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m256i out_28_5 = + _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = 
_mm256_packs_epi32(out_04_6, out_04_7); + out[20] = _mm256_packs_epi32(out_20_6, out_20_7); + out[12] = _mm256_packs_epi32(out_12_6, out_12_7); + out[28] = _mm256_packs_epi32(out_28_6, out_28_7); + } + { + step3[8] = _mm256_add_epi16(step2[9], step1[8]); + step3[9] = _mm256_sub_epi16(step1[8], step2[9]); + step3[10] = _mm256_sub_epi16(step1[11], step2[10]); + step3[11] = _mm256_add_epi16(step2[10], step1[11]); + step3[12] = _mm256_add_epi16(step2[13], step1[12]); + step3[13] = _mm256_sub_epi16(step1[12], step2[13]); + step3[14] = _mm256_sub_epi16(step1[15], step2[14]); + step3[15] = _mm256_add_epi16(step2[14], step1[15]); + } + { + const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); + const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); + const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); + const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); + const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); + const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]); + const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); + const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); + const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m256i s3_17_4 = + _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m256i s3_17_5 = + _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m256i s3_18_4 = + _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m256i s3_18_5 = + _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m256i s3_21_4 = + _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m256i s3_21_5 = + _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m256i s3_22_4 = + _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m256i s3_22_5 = + _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, 
DCT_CONST_BITS); + const __m256i s3_25_4 = + _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m256i s3_25_5 = + _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m256i s3_26_4 = + _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m256i s3_26_5 = + _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m256i s3_29_4 = + _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m256i s3_29_5 = + _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m256i s3_30_4 = + _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m256i s3_30_5 = + _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]); + const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]); + const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]); + const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]); + const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); + const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); + const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); + const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); + const __m256i out_02_2 = + _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m256i out_02_3 = + _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m256i out_18_2 = + _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m256i out_18_3 = + _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m256i out_10_2 = + _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m256i out_10_3 = + _mm256_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m256i out_26_2 = + _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m256i out_26_3 = + _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m256i out_06_2 = + _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m256i out_06_3 = + _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m256i out_22_2 = + _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m256i out_22_3 = + _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m256i out_14_2 = + _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m256i out_14_3 = + _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m256i out_30_2 = + _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m256i out_30_3 = + _mm256_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m256i out_02_4 = + _mm256_add_epi32(out_02_2, 
k__DCT_CONST_ROUNDING); + const __m256i out_02_5 = + _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m256i out_18_4 = + _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m256i out_18_5 = + _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m256i out_10_4 = + _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m256i out_10_5 = + _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m256i out_26_4 = + _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m256i out_26_5 = + _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m256i out_06_4 = + _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m256i out_06_5 = + _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m256i out_22_4 = + _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m256i out_22_5 = + _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m256i out_14_4 = + _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m256i out_14_5 = + _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m256i out_30_4 = + _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m256i out_30_5 = + _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm256_packs_epi32(out_02_6, out_02_7); + out[18] = _mm256_packs_epi32(out_18_6, out_18_7); + out[10] = _mm256_packs_epi32(out_10_6, out_10_7); + out[26] = _mm256_packs_epi32(out_26_6, out_26_7); + out[6] = _mm256_packs_epi32(out_06_6, out_06_7); + out[22] = _mm256_packs_epi32(out_22_6, out_22_7); + out[14] = _mm256_packs_epi32(out_14_6, out_14_7); + out[30] = _mm256_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm256_add_epi16(step3[17], step2[16]); + step1[17] = _mm256_sub_epi16(step2[16], step3[17]); + step1[18] = _mm256_sub_epi16(step2[19], step3[18]); + step1[19] = _mm256_add_epi16(step3[18], step2[19]); + step1[20] = _mm256_add_epi16(step3[21], step2[20]); + step1[21] = _mm256_sub_epi16(step2[20], step3[21]); + step1[22] = _mm256_sub_epi16(step2[23], step3[22]); + step1[23] = _mm256_add_epi16(step3[22], step2[23]); + step1[24] = _mm256_add_epi16(step3[25], step2[24]); + step1[25] = _mm256_sub_epi16(step2[24], step3[25]); + step1[26] = _mm256_sub_epi16(step2[27], step3[26]); + step1[27] = _mm256_add_epi16(step3[26], step2[27]); + step1[28] = _mm256_add_epi16(step3[29], step2[28]); + step1[29] = _mm256_sub_epi16(step2[28], 
step3[29]); + step1[30] = _mm256_sub_epi16(step2[31], step3[30]); + step1[31] = _mm256_add_epi16(step3[30], step2[31]); + } + // Final stage --- outputs indices are bit-reversed. + { + const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); + const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); + const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); + const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); + const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); + const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); + const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); + const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); + const __m256i out_01_2 = + _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m256i out_01_3 = + _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m256i out_17_2 = + _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m256i out_17_3 = + _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m256i out_09_2 = + _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m256i out_09_3 = + _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m256i out_25_2 = + _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m256i out_25_3 = + _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m256i out_07_2 = + _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m256i out_07_3 = + _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m256i out_23_2 = + _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m256i out_23_3 = + _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m256i out_15_2 = + _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m256i out_15_3 = + _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m256i out_31_2 = + _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m256i out_31_3 = + _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m256i out_01_4 = + _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m256i out_01_5 = + _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m256i out_17_4 = + _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m256i out_17_5 = + _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m256i out_09_4 = + _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m256i out_09_5 = + _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m256i out_25_4 = + _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m256i out_25_5 = + _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m256i out_07_4 = + _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m256i out_07_5 = + _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m256i out_23_4 = + _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m256i out_23_5 = + _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m256i out_15_4 = + _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m256i out_15_5 = + _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m256i out_31_4 = + _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m256i out_31_5 = + _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, 
DCT_CONST_BITS); + const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm256_packs_epi32(out_01_6, out_01_7); + out[17] = _mm256_packs_epi32(out_17_6, out_17_7); + out[9] = _mm256_packs_epi32(out_09_6, out_09_7); + out[25] = _mm256_packs_epi32(out_25_6, out_25_7); + out[7] = _mm256_packs_epi32(out_07_6, out_07_7); + out[23] = _mm256_packs_epi32(out_23_6, out_23_7); + out[15] = _mm256_packs_epi32(out_15_6, out_15_7); + out[31] = _mm256_packs_epi32(out_31_6, out_31_7); + } + { + const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); + const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); + const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); + const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); + const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); + const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); + const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); + const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); + const __m256i out_05_2 = + _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m256i out_05_3 = + _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m256i out_21_2 = + _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m256i out_21_3 = + _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m256i out_13_2 = + _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m256i out_13_3 = + _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m256i out_29_2 = + _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m256i out_29_3 = + _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m256i out_03_2 = + _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m256i out_03_3 = + _mm256_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m256i out_19_2 = + _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m256i out_19_3 = + _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m256i out_11_2 = + _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m256i out_11_3 = + _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m256i out_27_2 = + _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m256i out_27_3 = + _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m256i out_05_4 = + _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m256i out_05_5 = + _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m256i out_21_4 = + _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m256i out_21_5 = + _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m256i out_13_4 = + _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m256i 
out_13_5 = + _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m256i out_29_4 = + _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m256i out_29_5 = + _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m256i out_03_4 = + _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m256i out_03_5 = + _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m256i out_19_4 = + _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m256i out_19_5 = + _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m256i out_11_4 = + _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m256i out_11_5 = + _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m256i out_27_4 = + _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m256i out_27_5 = + _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm256_packs_epi32(out_05_6, out_05_7); + out[21] = _mm256_packs_epi32(out_21_6, out_21_7); + out[13] = _mm256_packs_epi32(out_13_6, out_13_7); + out[29] = _mm256_packs_epi32(out_29_6, out_29_7); + out[3] = _mm256_packs_epi32(out_03_6, out_03_7); + out[19] = _mm256_packs_epi32(out_19_6, out_19_7); + out[11] = _mm256_packs_epi32(out_11_6, out_11_7); + out[27] = _mm256_packs_epi32(out_27_6, out_27_7); + } #if FDCT32x32_HIGH_PRECISION } else { __m256i lstep1[64], lstep2[64], lstep3[64]; @@ -1157,32 +1378,32 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // stage 3 { // expanding to 32-bit length priori to addition operations - lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero); - lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero); - lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero); - lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero); - lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero); - lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero); - lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero); - lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero); - lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero); - lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero); - lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero); - lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero); - lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero); - lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero); - lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero); - lstep2[15] = _mm256_unpackhi_epi16(step2[ 
7], kZero); - lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne); - lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne); - lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne); - lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne); - lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne); - lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne); - lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne); - lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne); - lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne); - lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne); + lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero); + lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero); + lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero); + lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero); + lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero); + lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero); + lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero); + lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero); + lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero); + lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero); + lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero); + lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero); + lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero); + lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero); + lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero); + lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero); + lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne); + lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne); + lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne); + lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne); + lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne); + lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne); + lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne); + lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne); + lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne); + lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne); lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne); lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne); lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne); @@ -1190,22 +1411,22 @@ void FDCT32x32_2D_AVX2(const int16_t *input, lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne); lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne); - lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]); - lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]); - lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]); - lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]); - lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]); - lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]); - lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]); - lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]); - lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]); - lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]); - lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]); - lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]); - lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]); - lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]); - lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]); - lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]); + lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]); + lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]); + lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]); + lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]); + lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]); + lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]); + lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]); + lstep3[7] = 
_mm256_add_epi32(lstep2[9], lstep2[7]); + lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]); + lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]); + lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]); + lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]); + lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]); + lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]); + lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]); + lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]); } { const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); @@ -1221,14 +1442,22 @@ void FDCT32x32_2D_AVX2(const int16_t *input, const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); // dct_const_round_shift - const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); @@ -1343,10 +1572,10 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // stage 4 { // expanding to 32-bit length priori to addition operations - lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero); - lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero); - lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero); - lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero); + lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero); + lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero); + lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero); + lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero); lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero); lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero); lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero); @@ -1360,14 +1589,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input, lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne); lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne); - lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]); - lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]); - lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]); - lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]); - lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]); - lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]); - lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]); - lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], 
lstep3[ 7]); + lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]); + lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]); lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]); lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]); lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]); @@ -1386,57 +1615,62 @@ void FDCT32x32_2D_AVX2(const int16_t *input, lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]); } { - // to be continued... - // - const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64); - const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide - // instruction latency. - v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16); - v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16); - v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16); - v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16); - v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16); - v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16); - v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16); - v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + // to be continued... + // + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // instruction latency. 
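// For reference while reading the u[]/v[] groups below: k_madd_epi32_avx2()
// appears to mirror the SSE2 k_madd_epi32 helper (a 32x32 -> 64-bit
// multiply-add over interleaved pairs) and k_packs_epi64_avx2() repacks the
// 64-bit sums into 32-bit lanes, so together with k__DCT_CONST_ROUNDING and
// the DCT_CONST_BITS shift each group reduces, per lane, to the usual scalar
// fdct32 butterfly. A minimal scalar sketch (illustrative helper name,
// assuming the DCT_CONST_BITS constant already used in this file):
static int32_t fdct32_butterfly_ref(int32_t a, int32_t b, int32_t c0,
                                    int32_t c1) {
  // a*c0 + b*c1 with the same rounding and shift the vector code applies.
  const int64_t sum = (int64_t)a * c0 + (int64_t)b * c1;
  return (int32_t)((sum + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}
// e.g. the k32_p16_m16 path appears to produce, lane by lane,
// fdct32_butterfly_ref(lstep3[12], lstep3[10], cospi_16_64, -cospi_16_64)
// into lstep1[10].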
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); } { - const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64); - const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64); - const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); - u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); - u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); - u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); - u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]); - u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); - u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); - u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); - u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); - u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]); + u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]); u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]); u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]); @@ -1444,16 +1678,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]); u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m08_p24); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08); + v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); + v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); + v[2] = k_madd_epi32_avx2(u[2], 
k32_m08_p24); + v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); + v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24); + v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08); + v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08); v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08); v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08); v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08); @@ -1464,29 +1698,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24); v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24); v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24); - v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24); - v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24); + v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24); + v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24); v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24); v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24); - v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08); - v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08); - v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08); - v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08); - v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08); - v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08); - v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08); - v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08); - - u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08); + v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08); + v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08); + v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08); + v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08); + v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -1494,16 +1728,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = 
_mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -1511,16 +1745,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -1530,20 +1764,24 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // stage 5 { - lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]); - lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]); - lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]); - lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]); + lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]); lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]); lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]); lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]); lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]); } { - const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64); - const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64); - const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64); - const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); u[0] = _mm256_unpacklo_epi32(lstep1[0], 
lstep1[2]); u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]); @@ -1556,16 +1794,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide // instruction latency. - v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16); - v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16); - v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16); - v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16); - v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16); - v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16); - v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16); - v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16); - v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08); - v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08); v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08); v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08); v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24); @@ -1600,14 +1838,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - sign[0] = _mm256_cmpgt_epi32(kZero,u[0]); - sign[1] = _mm256_cmpgt_epi32(kZero,u[1]); - sign[2] = _mm256_cmpgt_epi32(kZero,u[2]); - sign[3] = _mm256_cmpgt_epi32(kZero,u[3]); - sign[4] = _mm256_cmpgt_epi32(kZero,u[4]); - sign[5] = _mm256_cmpgt_epi32(kZero,u[5]); - sign[6] = _mm256_cmpgt_epi32(kZero,u[6]); - sign[7] = _mm256_cmpgt_epi32(kZero,u[7]); + sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); u[0] = _mm256_sub_epi32(u[0], sign[0]); u[1] = _mm256_sub_epi32(u[1], sign[1]); @@ -1637,15 +1875,18 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[7] = _mm256_srai_epi32(u[7], 2); // Combine - out[ 0] = _mm256_packs_epi32(u[0], u[1]); + out[0] = _mm256_packs_epi32(u[0], u[1]); out[16] = _mm256_packs_epi32(u[2], u[3]); - out[ 8] = _mm256_packs_epi32(u[4], u[5]); + out[8] = _mm256_packs_epi32(u[4], u[5]); out[24] = _mm256_packs_epi32(u[6], u[7]); } { - const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64); - const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64); - const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64); + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]); u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]); @@ -1664,8 +1905,8 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08); v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08); v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08); - v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24); - v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[8] = k_madd_epi32_avx2(u[4], 
k32_m08_p24); + v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24); v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24); v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24); v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08); @@ -1736,15 +1977,19 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // stage 6 { - const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64); - const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64); - const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64); - const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64); - - u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]); - u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]); - u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]); - u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]); + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); @@ -1753,10 +1998,10 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); - u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]); - u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]); - u[14] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]); - u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]); + u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04); v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04); @@ -1766,8 +2011,8 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20); v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20); v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28); @@ -1802,14 +2047,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - sign[0] = _mm256_cmpgt_epi32(kZero,u[0]); - sign[1] = _mm256_cmpgt_epi32(kZero,u[1]); - sign[2] = _mm256_cmpgt_epi32(kZero,u[2]); - sign[3] = _mm256_cmpgt_epi32(kZero,u[3]); - sign[4] = _mm256_cmpgt_epi32(kZero,u[4]); - sign[5] = _mm256_cmpgt_epi32(kZero,u[5]); - sign[6] = _mm256_cmpgt_epi32(kZero,u[6]); - sign[7] = _mm256_cmpgt_epi32(kZero,u[7]); + sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + 
sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); u[0] = _mm256_sub_epi32(u[0], sign[0]); u[1] = _mm256_sub_epi32(u[1], sign[1]); @@ -1838,7 +2083,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[6] = _mm256_srai_epi32(u[6], 2); u[7] = _mm256_srai_epi32(u[7], 2); - out[ 4] = _mm256_packs_epi32(u[0], u[1]); + out[4] = _mm256_packs_epi32(u[0], u[1]); out[20] = _mm256_packs_epi32(u[2], u[3]); out[12] = _mm256_packs_epi32(u[4], u[5]); out[28] = _mm256_packs_epi32(u[6], u[7]); @@ -1862,24 +2107,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]); } { - const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64); - const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64); - const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64); - const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64, - -cospi_20_64); - const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64); - const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); - u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); - u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); - u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); - u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); - u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); - u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); - u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); - u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); - u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + const __m256i k32_m28_m04 = + pair256_set_epi32(-cospi_28_64, -cospi_4_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m12_m20 = + pair256_set_epi32(-cospi_12_64, -cospi_20_64); + const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]); u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]); u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]); @@ -1887,16 +2137,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]); u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12); - v[ 9] = 
k_madd_epi32_avx2(u[ 9], k32_m20_p12); + v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28); + v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28); + v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28); + v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28); + v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04); + v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04); + v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04); + v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20); @@ -1907,29 +2157,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12); v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12); v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12); - v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20); - v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20); + v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20); + v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20); v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20); v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20); - v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28); - v[25] = k_madd_epi32_avx2(u[ 5], k32_m04_p28); - v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28); - v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28); - v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04); - v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04); - v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04); - v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04); - - u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28); + v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28); + v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28); + v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28); + v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04); + v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04); + v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04); + v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -1937,16 +2187,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = 
_mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -1954,16 +2204,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -1973,25 +2223,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // stage 7 { - const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64); - const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64); - const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64); - const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64, cospi_26_64); - const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64); - const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64); - const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64); - const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]); - u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); - u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); - u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); - u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]); - u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); - u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); - u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); - u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); - u[ 9] = _mm256_unpackhi_epi32(lstep3[20], 
lstep3[26]); + const __m256i k32_p30_p02 = + pair256_set_epi32(cospi_30_64, cospi_2_64); + const __m256i k32_p14_p18 = + pair256_set_epi32(cospi_14_64, cospi_18_64); + const __m256i k32_p22_p10 = + pair256_set_epi32(cospi_22_64, cospi_10_64); + const __m256i k32_p06_p26 = + pair256_set_epi32(cospi_6_64, cospi_26_64); + const __m256i k32_m26_p06 = + pair256_set_epi32(-cospi_26_64, cospi_6_64); + const __m256i k32_m10_p22 = + pair256_set_epi32(-cospi_10_64, cospi_22_64); + const __m256i k32_m18_p14 = + pair256_set_epi32(-cospi_18_64, cospi_14_64); + const __m256i k32_m02_p30 = + pair256_set_epi32(-cospi_2_64, cospi_30_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]); u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]); u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]); u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]); @@ -1999,16 +2257,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]); u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10); + v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02); + v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02); + v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02); + v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02); + v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18); + v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18); + v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18); + v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18); + v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10); + v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10); v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10); v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10); v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26); @@ -2019,29 +2277,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06); v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06); v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06); - v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22); - v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22); + v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22); + v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22); v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22); v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22); - v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14); - v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14); - v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14); - v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14); - v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30); - v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30); - v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30); - v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30); - - u[ 0] = 
k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14); + v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14); + v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14); + v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14); + v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30); + v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30); + v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30); + v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -2049,16 +2307,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2066,16 +2324,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = 
_mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -2083,33 +2341,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]); - v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]); - v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]); - v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]); - v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]); - v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]); - v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]); - v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]); - v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]); - v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]); - v[10] = _mm256_cmpgt_epi32(kZero,u[10]); - v[11] = _mm256_cmpgt_epi32(kZero,u[11]); - v[12] = _mm256_cmpgt_epi32(kZero,u[12]); - v[13] = _mm256_cmpgt_epi32(kZero,u[13]); - v[14] = _mm256_cmpgt_epi32(kZero,u[14]); - v[15] = _mm256_cmpgt_epi32(kZero,u[15]); - - u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]); + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); u[10] = _mm256_sub_epi32(u[10], v[10]); u[11] = _mm256_sub_epi32(u[11], v[11]); u[12] = _mm256_sub_epi32(u[12], v[12]); @@ -2117,16 +2375,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_sub_epi32(u[14], v[14]); u[15] = _mm256_sub_epi32(u[15], v[15]); - v[ 0] = _mm256_add_epi32(u[ 0], K32One); - v[ 1] = _mm256_add_epi32(u[ 1], K32One); - v[ 2] = _mm256_add_epi32(u[ 2], K32One); - v[ 3] = _mm256_add_epi32(u[ 3], K32One); - v[ 4] = _mm256_add_epi32(u[ 4], K32One); - v[ 5] = _mm256_add_epi32(u[ 5], K32One); - v[ 6] = _mm256_add_epi32(u[ 
6], K32One); - v[ 7] = _mm256_add_epi32(u[ 7], K32One); - v[ 8] = _mm256_add_epi32(u[ 8], K32One); - v[ 9] = _mm256_add_epi32(u[ 9], K32One); + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); v[10] = _mm256_add_epi32(u[10], K32One); v[11] = _mm256_add_epi32(u[11], K32One); v[12] = _mm256_add_epi32(u[12], K32One); @@ -2134,16 +2392,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], K32One); v[15] = _mm256_add_epi32(u[15], K32One); - u[ 0] = _mm256_srai_epi32(v[ 0], 2); - u[ 1] = _mm256_srai_epi32(v[ 1], 2); - u[ 2] = _mm256_srai_epi32(v[ 2], 2); - u[ 3] = _mm256_srai_epi32(v[ 3], 2); - u[ 4] = _mm256_srai_epi32(v[ 4], 2); - u[ 5] = _mm256_srai_epi32(v[ 5], 2); - u[ 6] = _mm256_srai_epi32(v[ 6], 2); - u[ 7] = _mm256_srai_epi32(v[ 7], 2); - u[ 8] = _mm256_srai_epi32(v[ 8], 2); - u[ 9] = _mm256_srai_epi32(v[ 9], 2); + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); u[10] = _mm256_srai_epi32(v[10], 2); u[11] = _mm256_srai_epi32(v[11], 2); u[12] = _mm256_srai_epi32(v[12], 2); @@ -2151,11 +2409,11 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], 2); u[15] = _mm256_srai_epi32(v[15], 2); - out[ 2] = _mm256_packs_epi32(u[0], u[1]); + out[2] = _mm256_packs_epi32(u[0], u[1]); out[18] = _mm256_packs_epi32(u[2], u[3]); out[10] = _mm256_packs_epi32(u[4], u[5]); out[26] = _mm256_packs_epi32(u[6], u[7]); - out[ 6] = _mm256_packs_epi32(u[8], u[9]); + out[6] = _mm256_packs_epi32(u[8], u[9]); out[22] = _mm256_packs_epi32(u[10], u[11]); out[14] = _mm256_packs_epi32(u[12], u[13]); out[30] = _mm256_packs_epi32(u[14], u[15]); @@ -2196,25 +2454,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // stage 8 { - const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64); - const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64); - const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64); - const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64); - const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64); - const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64); - const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64); - const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); - u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); - u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); - u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); - u[ 4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]); - u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); - u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]); - u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); - u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); - u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); + const __m256i k32_p31_p01 
= + pair256_set_epi32(cospi_31_64, cospi_1_64); + const __m256i k32_p15_p17 = + pair256_set_epi32(cospi_15_64, cospi_17_64); + const __m256i k32_p23_p09 = + pair256_set_epi32(cospi_23_64, cospi_9_64); + const __m256i k32_p07_p25 = + pair256_set_epi32(cospi_7_64, cospi_25_64); + const __m256i k32_m25_p07 = + pair256_set_epi32(-cospi_25_64, cospi_7_64); + const __m256i k32_m09_p23 = + pair256_set_epi32(-cospi_9_64, cospi_23_64); + const __m256i k32_m17_p15 = + pair256_set_epi32(-cospi_17_64, cospi_15_64); + const __m256i k32_m01_p31 = + pair256_set_epi32(-cospi_1_64, cospi_31_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]); u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]); u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]); @@ -2222,16 +2488,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]); u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09); + v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01); + v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01); + v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01); + v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01); + v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17); + v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17); + v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17); + v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17); + v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09); + v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09); v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09); v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09); v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25); @@ -2242,29 +2508,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07); v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07); v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07); - v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23); - v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23); + v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23); + v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23); v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23); v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23); - v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15); - v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15); - v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15); - v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15); - v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31); - v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31); - v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31); - v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31); - - u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = 
k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15); + v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15); + v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15); + v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15); + v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31); + v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31); + v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31); + v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -2272,16 +2538,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2289,16 +2555,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = 
_mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -2306,33 +2572,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]); - v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]); - v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]); - v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]); - v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]); - v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]); - v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]); - v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]); - v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]); - v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]); - v[10] = _mm256_cmpgt_epi32(kZero,u[10]); - v[11] = _mm256_cmpgt_epi32(kZero,u[11]); - v[12] = _mm256_cmpgt_epi32(kZero,u[12]); - v[13] = _mm256_cmpgt_epi32(kZero,u[13]); - v[14] = _mm256_cmpgt_epi32(kZero,u[14]); - v[15] = _mm256_cmpgt_epi32(kZero,u[15]); - - u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]); + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); u[10] = _mm256_sub_epi32(u[10], v[10]); u[11] = _mm256_sub_epi32(u[11], v[11]); u[12] = _mm256_sub_epi32(u[12], v[12]); @@ -2374,35 +2640,43 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], 2); u[15] = _mm256_srai_epi32(v[15], 2); - out[ 1] = _mm256_packs_epi32(u[0], u[1]); + out[1] = _mm256_packs_epi32(u[0], u[1]); out[17] = _mm256_packs_epi32(u[2], u[3]); - out[ 9] = _mm256_packs_epi32(u[4], u[5]); + out[9] = _mm256_packs_epi32(u[4], u[5]); out[25] = _mm256_packs_epi32(u[6], u[7]); - out[ 7] = _mm256_packs_epi32(u[8], u[9]); + out[7] = _mm256_packs_epi32(u[8], u[9]); 
out[23] = _mm256_packs_epi32(u[10], u[11]); out[15] = _mm256_packs_epi32(u[12], u[13]); out[31] = _mm256_packs_epi32(u[14], u[15]); } { - const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64); - const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64); - const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64); - const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64); - const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64); - const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64); - const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64); - const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); - u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); - u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); - u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); - u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); - u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); - u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); - u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); - u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); - u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); + const __m256i k32_p27_p05 = + pair256_set_epi32(cospi_27_64, cospi_5_64); + const __m256i k32_p11_p21 = + pair256_set_epi32(cospi_11_64, cospi_21_64); + const __m256i k32_p19_p13 = + pair256_set_epi32(cospi_19_64, cospi_13_64); + const __m256i k32_p03_p29 = + pair256_set_epi32(cospi_3_64, cospi_29_64); + const __m256i k32_m29_p03 = + pair256_set_epi32(-cospi_29_64, cospi_3_64); + const __m256i k32_m13_p19 = + pair256_set_epi32(-cospi_13_64, cospi_19_64); + const __m256i k32_m21_p11 = + pair256_set_epi32(-cospi_21_64, cospi_11_64); + const __m256i k32_m05_p27 = + pair256_set_epi32(-cospi_5_64, cospi_27_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]); u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]); u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]); @@ -2410,16 +2684,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]); u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13); + v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05); + v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05); + v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05); + v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05); + v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21); 
+ v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21); + v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21); + v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21); + v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13); + v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13); v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13); v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13); v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29); @@ -2430,29 +2704,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03); v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03); v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03); - v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19); - v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19); + v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19); + v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19); v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19); v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19); - v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11); - v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11); - v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11); - v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11); - v[28] = k_madd_epi32_avx2(u[ 0], k32_m05_p27); - v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27); - v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27); - v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27); - - u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11); + v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11); + v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11); + v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11); + v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27); + v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27); + v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27); + v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -2460,16 +2734,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = 
_mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2477,16 +2751,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -2494,33 +2768,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]); - v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]); - v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]); - v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]); - v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]); - v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]); - v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]); - v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]); - v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]); - v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]); - v[10] = _mm256_cmpgt_epi32(kZero,u[10]); - v[11] = _mm256_cmpgt_epi32(kZero,u[11]); - v[12] = _mm256_cmpgt_epi32(kZero,u[12]); - v[13] = _mm256_cmpgt_epi32(kZero,u[13]); - v[14] = _mm256_cmpgt_epi32(kZero,u[14]); - v[15] = _mm256_cmpgt_epi32(kZero,u[15]); - - u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]); + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, 
u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); u[10] = _mm256_sub_epi32(u[10], v[10]); u[11] = _mm256_sub_epi32(u[11], v[11]); u[12] = _mm256_sub_epi32(u[12], v[12]); @@ -2562,11 +2836,11 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], 2); u[15] = _mm256_srai_epi32(v[15], 2); - out[ 5] = _mm256_packs_epi32(u[0], u[1]); + out[5] = _mm256_packs_epi32(u[0], u[1]); out[21] = _mm256_packs_epi32(u[2], u[3]); out[13] = _mm256_packs_epi32(u[4], u[5]); out[29] = _mm256_packs_epi32(u[6], u[7]); - out[ 3] = _mm256_packs_epi32(u[8], u[9]); + out[3] = _mm256_packs_epi32(u[8], u[9]); out[19] = _mm256_packs_epi32(u[10], u[11]); out[11] = _mm256_packs_epi32(u[12], u[13]); out[27] = _mm256_packs_epi32(u[14], u[15]); @@ -2576,13 +2850,13 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // Transpose the results, do it as four 8x8 transposes. { int transpose_block; - int16_t *output_currStep,*output_nextStep; - if (0 == pass){ - output_currStep = &intermediate[column_start * 32]; - output_nextStep = &intermediate[(column_start + 8) * 32]; - } else{ - output_currStep = &output_org[column_start * 32]; - output_nextStep = &output_org[(column_start + 8) * 32]; + int16_t *output_currStep, *output_nextStep; + if (0 == pass) { + output_currStep = &intermediate[column_start * 32]; + output_nextStep = &intermediate[(column_start + 8) * 32]; + } else { + output_currStep = &output_org[column_start * 32]; + output_nextStep = &output_org[(column_start + 8) * 32]; } for (transpose_block = 0; transpose_block < 4; ++transpose_block) { __m256i *this_out = &out[8 * transpose_block]; @@ -2685,23 +2959,39 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // Note: even though all these stores are aligned, using the aligned // intrinsic make the code slightly slower. 
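Structurally, each reflowed store pair below writes one 256-bit transposed row as two 128-bit halves: the low half to the current 8-column strip (output_currStep) and the high half, via _mm256_extractf128_si256, to the strip 8 * 32 elements further on (output_nextStep). A sketch under that reading (the helper name is illustrative, not part of the patch):

    #include <immintrin.h>
    #include <stdint.h>

    /* Sketch of the split store used below for each transposed row. */
    static void sketch_store_halves(int16_t *curr, int16_t *next, int row,
                                    __m256i tr2_row) {
      _mm_storeu_si128((__m128i *)(curr + row * 32),
                       _mm256_castsi256_si128(tr2_row));      /* low 128 bits  */
      _mm_storeu_si128((__m128i *)(next + row * 32),
                       _mm256_extractf128_si256(tr2_row, 1)); /* high 128 bits */
    }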
- _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0)); - _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1)); - _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2)); - _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3)); - _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4)); - _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5)); - _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6)); - _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7)); - - _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1)); + _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), + _mm256_castsi256_si128(tr2_0)); + _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), + _mm256_castsi256_si128(tr2_1)); + _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), + _mm256_castsi256_si128(tr2_2)); + _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), + _mm256_castsi256_si128(tr2_3)); + _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), + _mm256_castsi256_si128(tr2_4)); + _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), + _mm256_castsi256_si128(tr2_5)); + _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), + _mm256_castsi256_si128(tr2_6)); + _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), + _mm256_castsi256_si128(tr2_7)); + + _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), + _mm256_extractf128_si256(tr2_0, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), + _mm256_extractf128_si256(tr2_1, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), + _mm256_extractf128_si256(tr2_2, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), + _mm256_extractf128_si256(tr2_3, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), + _mm256_extractf128_si256(tr2_4, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), + _mm256_extractf128_si256(tr2_5, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), + _mm256_extractf128_si256(tr2_6, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), + _mm256_extractf128_si256(tr2_7, 1)); // Process next 8x8 output_currStep += 8; output_nextStep += 8; diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index b85ae103fa470a6c2356e671653070a8ae0eef62..37443339094b30acd90437e44c3e8b6007d32055 100644 --- a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -22,42 +22,37 @@ #define SUB_EPI16 _mm_subs_epi16 #if FDCT32x32_HIGH_PRECISION void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], 
temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = intermediate[j * 32 + i]; - vpx_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - out[j + i * 32] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); - } + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + vpx_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + out[j + i * 32] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } } - #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c - #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c +#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c +#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c #else void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = intermediate[j * 32 + i]; - vpx_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - out[j + i * 32] = (tran_low_t)temp_out[j]; - } + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + vpx_fdct32(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + } } - #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c - #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c +#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c +#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c #endif // FDCT32x32_HIGH_PRECISION #else #define ADD_EPI16 _mm_add_epi16 #define SUB_EPI16 _mm_sub_epi16 #endif // DCT_HIGH_BIT_DEPTH - -void FDCT32x32_2D(const int16_t *input, - tran_low_t *output_org, int stride) { +void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { // Calculate pre-multiplied strides const int str1 = stride; const int str2 = 2 * stride; @@ -70,42 +65,42 @@ void FDCT32x32_2D(const int16_t *input, // by constructing the 32 bit constant corresponding to that pair. 
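As a rough scalar sketch of how the paired constants declared below are consumed: each pair_set_epi16(c0, c1) constant feeds _mm_madd_epi16 on inputs interleaved with _mm_unpacklo/hi_epi16, producing a * c0 + b * c1 per 32-bit lane, which dct_const_round_shift then rounds and shifts. The helper name and the DCT_CONST_BITS value (taken as 14) are illustrative assumptions, not part of this patch:

    #include <stdint.h>

    /* Illustrative only: scalar equivalent of one madd + dct_const_round_shift
       lane, assuming DCT_CONST_BITS == 14. */
    #define SKETCH_DCT_CONST_BITS 14
    #define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

    static int32_t sketch_butterfly(int16_t a, int16_t b, int16_t c0, int16_t c1) {
      const int32_t sum = (int32_t)a * c0 + (int32_t)b * c1;
      return (sum + SKETCH_DCT_CONST_ROUNDING) >> SKETCH_DCT_CONST_BITS;
    }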
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); - const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); - const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); - const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); - const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); - const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); - const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); - const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); - const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); - const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); - const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); - const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); - const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); - const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); - const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); - const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + 
const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i kZero = _mm_set1_epi16(0); - const __m128i kOne = _mm_set1_epi16(1); + const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes int pass; #if DCT_HIGH_BIT_DEPTH @@ -123,125 +118,125 @@ void FDCT32x32_2D(const int16_t *input, // Note: even though all the loads below are aligned, using the aligned // intrinsic make the code slightly slower. if (0 == pass) { - const int16_t *in = &input[column_start]; + const int16_t *in = &input[column_start]; // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. 
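For orientation, a scalar sketch (hypothetical names, not part of the patch) of what each of the four blocks below computes for one row pair: the sum and difference of row i and its mirror row 31 - i, each pre-scaled by 4 as in the commented formula above.

    #include <stdint.h>

    /* Sketch of one pass-0 step1 butterfly: step1[i] and step1[31 - i]. */
    static void sketch_step1_pair(const int16_t *in, int stride, int i,
                                  int16_t *sum, int16_t *diff) {
      const int16_t a = in[i * stride];        /* row i             */
      const int16_t b = in[(31 - i) * stride]; /* mirror row 31 - i */
      *sum = (int16_t)((a + b) << 2);          /* becomes step1[i]      */
      *diff = (int16_t)((a - b) << 2);         /* becomes step1[31 - i] */
    }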
{ - const int16_t *ina = in + 0 * str1; - const int16_t *inb = in + 31 * str1; - __m128i *step1a = &step1[ 0]; + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m128i *step1a = &step1[0]; __m128i *step1b = &step1[31]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); step1b[-3] = _mm_sub_epi16(ina3, inb3); step1b[-2] = _mm_sub_epi16(ina2, inb2); step1b[-1] = _mm_sub_epi16(ina1, inb1); step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); step1b[-3] = _mm_slli_epi16(step1b[-3], 2); step1b[-2] = _mm_slli_epi16(step1b[-2], 2); step1b[-1] = _mm_slli_epi16(step1b[-1], 2); step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 4 * str1; - const int16_t *inb = in + 27 * str1; - __m128i *step1a = &step1[ 4]; + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m128i *step1a = &step1[4]; __m128i *step1b = &step1[27]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina 
+ str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); step1b[-3] = _mm_sub_epi16(ina3, inb3); step1b[-2] = _mm_sub_epi16(ina2, inb2); step1b[-1] = _mm_sub_epi16(ina1, inb1); step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); step1b[-3] = _mm_slli_epi16(step1b[-3], 2); step1b[-2] = _mm_slli_epi16(step1b[-2], 2); step1b[-1] = _mm_slli_epi16(step1b[-1], 2); step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 8 * str1; - const int16_t *inb = in + 23 * str1; - __m128i *step1a = &step1[ 8]; + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m128i *step1a = &step1[8]; __m128i *step1b = &step1[23]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); step1b[-3] = _mm_sub_epi16(ina3, inb3); step1b[-2] = _mm_sub_epi16(ina2, inb2); step1b[-1] = _mm_sub_epi16(ina1, inb1); step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); step1b[-3] = _mm_slli_epi16(step1b[-3], 2); step1b[-2] = _mm_slli_epi16(step1b[-2], 2); step1b[-1] = 
_mm_slli_epi16(step1b[-1], 2); step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 12 * str1; - const int16_t *inb = in + 19 * str1; + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; __m128i *step1a = &step1[12]; __m128i *step1b = &step1[19]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); step1b[-3] = _mm_sub_epi16(ina3, inb3); step1b[-2] = _mm_sub_epi16(ina2, inb2); step1b[-1] = _mm_sub_epi16(ina1, inb1); step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); step1b[-3] = _mm_slli_epi16(step1b[-3], 2); step1b[-2] = _mm_slli_epi16(step1b[-2], 2); step1b[-1] = _mm_slli_epi16(step1b[-1], 2); @@ -256,14 +251,14 @@ void FDCT32x32_2D(const int16_t *input, // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. 
{ - __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); - __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); - __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); - __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); - __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); - __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); - __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); - __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); step1[0] = ADD_EPI16(in00, in31); step1[1] = ADD_EPI16(in01, in30); step1[2] = ADD_EPI16(in02, in29); @@ -283,14 +278,14 @@ void FDCT32x32_2D(const int16_t *input, #endif // DCT_HIGH_BIT_DEPTH } { - __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); - __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); - __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); - __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); - __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); - __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); - __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); - __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); + __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); step1[4] = ADD_EPI16(in04, in27); step1[5] = ADD_EPI16(in05, in26); step1[6] = ADD_EPI16(in06, in25); @@ -310,14 +305,14 @@ void FDCT32x32_2D(const int16_t *input, #endif // DCT_HIGH_BIT_DEPTH } { - __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); - __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); - __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); - __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); - __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); - __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); - __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); - __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); + __m128i in23 = 
_mm_loadu_si128((const __m128i *)(in + 23 * 32)); step1[8] = ADD_EPI16(in08, in23); step1[9] = ADD_EPI16(in09, in22); step1[10] = ADD_EPI16(in10, in21); @@ -337,14 +332,14 @@ void FDCT32x32_2D(const int16_t *input, #endif // DCT_HIGH_BIT_DEPTH } { - __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); - __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); - __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); - __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); - __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); - __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); - __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); - __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); step1[12] = ADD_EPI16(in12, in19); step1[13] = ADD_EPI16(in13, in18); step1[14] = ADD_EPI16(in14, in17); @@ -372,10 +367,10 @@ void FDCT32x32_2D(const int16_t *input, step2[3] = ADD_EPI16(step1[3], step1[12]); step2[4] = ADD_EPI16(step1[4], step1[11]); step2[5] = ADD_EPI16(step1[5], step1[10]); - step2[6] = ADD_EPI16(step1[6], step1[ 9]); - step2[7] = ADD_EPI16(step1[7], step1[ 8]); - step2[8] = SUB_EPI16(step1[7], step1[ 8]); - step2[9] = SUB_EPI16(step1[6], step1[ 9]); + step2[6] = ADD_EPI16(step1[6], step1[9]); + step2[7] = ADD_EPI16(step1[7], step1[8]); + step2[8] = SUB_EPI16(step1[7], step1[8]); + step2[9] = SUB_EPI16(step1[6], step1[9]); step2[10] = SUB_EPI16(step1[5], step1[10]); step2[11] = SUB_EPI16(step1[4], step1[11]); step2[12] = SUB_EPI16(step1[3], step1[12]); @@ -384,9 +379,8 @@ void FDCT32x32_2D(const int16_t *input, step2[15] = SUB_EPI16(step1[0], step1[15]); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x16( - &step2[0], &step2[1], &step2[2], &step2[3], - &step2[4], &step2[5], &step2[6], &step2[7], - &step2[8], &step2[9], &step2[10], &step2[11], + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], &step2[12], &step2[13], &step2[14], &step2[15]); if (overflow) { if (pass == 0) @@ -482,16 +476,16 @@ void FDCT32x32_2D(const int16_t *input, // dump the magnitude by half, hence the intermediate values are within // the range of 16 bits. 
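In scalar terms, the pass-1 rounding inside the block below is the same expression used by vpx_fdct32x32_rows_c earlier in this patch: the _mm_cmplt_epi16 mask is -1 for negative lanes, so subtracting it adds 1 before the extra kOne and the arithmetic shift by 2. A minimal sketch (helper name is illustrative):

    #include <stdint.h>

    /* Sketch: (x + 1 + (x < 0)) >> 2, matching the cmplt/sub/add/srai sequence. */
    static int16_t sketch_round_shift2(int16_t x) {
      return (int16_t)((x + 1 + (x < 0)) >> 2);
    }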
if (1 == pass) { - __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero); - __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero); - __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero); - __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero); - __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero); - __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero); - __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero); - __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero); - __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); - __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); + __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero); __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); @@ -515,16 +509,16 @@ void FDCT32x32_2D(const int16_t *input, __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); - step2[0] = SUB_EPI16(step2[ 0], s3_00_0); - step2[1] = SUB_EPI16(step2[ 1], s3_01_0); - step2[2] = SUB_EPI16(step2[ 2], s3_02_0); - step2[3] = SUB_EPI16(step2[ 3], s3_03_0); - step2[4] = SUB_EPI16(step2[ 4], s3_04_0); - step2[5] = SUB_EPI16(step2[ 5], s3_05_0); - step2[6] = SUB_EPI16(step2[ 6], s3_06_0); - step2[7] = SUB_EPI16(step2[ 7], s3_07_0); - step2[8] = SUB_EPI16(step2[ 8], s2_08_0); - step2[9] = SUB_EPI16(step2[ 9], s2_09_0); + step2[0] = SUB_EPI16(step2[0], s3_00_0); + step2[1] = SUB_EPI16(step2[1], s3_01_0); + step2[2] = SUB_EPI16(step2[2], s3_02_0); + step2[3] = SUB_EPI16(step2[3], s3_03_0); + step2[4] = SUB_EPI16(step2[4], s3_04_0); + step2[5] = SUB_EPI16(step2[5], s3_05_0); + step2[6] = SUB_EPI16(step2[6], s3_06_0); + step2[7] = SUB_EPI16(step2[7], s3_07_0); + step2[8] = SUB_EPI16(step2[8], s2_08_0); + step2[9] = SUB_EPI16(step2[9], s2_09_0); step2[10] = SUB_EPI16(step2[10], s3_10_0); step2[11] = SUB_EPI16(step2[11], s3_11_0); step2[12] = SUB_EPI16(step2[12], s3_12_0); @@ -549,29 +543,27 @@ void FDCT32x32_2D(const int16_t *input, step1[31] = SUB_EPI16(step1[31], s3_31_0); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x32( - &step2[0], &step2[1], &step2[2], &step2[3], - &step2[4], &step2[5], &step2[6], &step2[7], - &step2[8], &step2[9], &step2[10], &step2[11], - &step2[12], &step2[13], &step2[14], &step2[15], - &step1[16], &step1[17], &step1[18], &step1[19], - &step2[20], &step2[21], &step2[22], &step2[23], - &step2[24], &step2[25], &step2[26], &step2[27], - &step1[28], &step1[29], &step1[30], &step1[31]); + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], + &step2[12], &step2[13], &step2[14], &step2[15], &step1[16], + &step1[17], &step1[18], &step1[19], &step2[20], &step2[21], + &step2[22], &step2[23], &step2[24], &step2[25], &step2[26], + &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - step2[0] = _mm_add_epi16(step2[ 0], kOne); - step2[1] = 
_mm_add_epi16(step2[ 1], kOne); - step2[2] = _mm_add_epi16(step2[ 2], kOne); - step2[3] = _mm_add_epi16(step2[ 3], kOne); - step2[4] = _mm_add_epi16(step2[ 4], kOne); - step2[5] = _mm_add_epi16(step2[ 5], kOne); - step2[6] = _mm_add_epi16(step2[ 6], kOne); - step2[7] = _mm_add_epi16(step2[ 7], kOne); - step2[8] = _mm_add_epi16(step2[ 8], kOne); - step2[9] = _mm_add_epi16(step2[ 9], kOne); + step2[0] = _mm_add_epi16(step2[0], kOne); + step2[1] = _mm_add_epi16(step2[1], kOne); + step2[2] = _mm_add_epi16(step2[2], kOne); + step2[3] = _mm_add_epi16(step2[3], kOne); + step2[4] = _mm_add_epi16(step2[4], kOne); + step2[5] = _mm_add_epi16(step2[5], kOne); + step2[6] = _mm_add_epi16(step2[6], kOne); + step2[7] = _mm_add_epi16(step2[7], kOne); + step2[8] = _mm_add_epi16(step2[8], kOne); + step2[9] = _mm_add_epi16(step2[9], kOne); step2[10] = _mm_add_epi16(step2[10], kOne); step2[11] = _mm_add_epi16(step2[11], kOne); step2[12] = _mm_add_epi16(step2[12], kOne); @@ -595,16 +587,16 @@ void FDCT32x32_2D(const int16_t *input, step1[30] = _mm_add_epi16(step1[30], kOne); step1[31] = _mm_add_epi16(step1[31], kOne); - step2[0] = _mm_srai_epi16(step2[ 0], 2); - step2[1] = _mm_srai_epi16(step2[ 1], 2); - step2[2] = _mm_srai_epi16(step2[ 2], 2); - step2[3] = _mm_srai_epi16(step2[ 3], 2); - step2[4] = _mm_srai_epi16(step2[ 4], 2); - step2[5] = _mm_srai_epi16(step2[ 5], 2); - step2[6] = _mm_srai_epi16(step2[ 6], 2); - step2[7] = _mm_srai_epi16(step2[ 7], 2); - step2[8] = _mm_srai_epi16(step2[ 8], 2); - step2[9] = _mm_srai_epi16(step2[ 9], 2); + step2[0] = _mm_srai_epi16(step2[0], 2); + step2[1] = _mm_srai_epi16(step2[1], 2); + step2[2] = _mm_srai_epi16(step2[2], 2); + step2[3] = _mm_srai_epi16(step2[3], 2); + step2[4] = _mm_srai_epi16(step2[4], 2); + step2[5] = _mm_srai_epi16(step2[5], 2); + step2[6] = _mm_srai_epi16(step2[6], 2); + step2[7] = _mm_srai_epi16(step2[7], 2); + step2[8] = _mm_srai_epi16(step2[8], 2); + step2[9] = _mm_srai_epi16(step2[9], 2); step2[10] = _mm_srai_epi16(step2[10], 2); step2[11] = _mm_srai_epi16(step2[11], 2); step2[12] = _mm_srai_epi16(step2[12], 2); @@ -633,821 +625,884 @@ void FDCT32x32_2D(const int16_t *input, #if FDCT32x32_HIGH_PRECISION if (pass == 0) { #endif - // Stage 3 - { - step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); - step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); - step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); - step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); - step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); - step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); - step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); - step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]); + // Stage 3 + { + step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); + step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); + step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); + step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); + step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); + step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); + step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); + step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], - &step3[3], &step3[4], &step3[5], - &step3[6], &step3[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], + &step3[3], &step3[4], &step3[5], + &step3[6], &step3[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, 
output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step3[10], &step3[11], - &step3[12], &step3[13]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[16] = ADD_EPI16(step2[23], step1[16]); - step3[17] = ADD_EPI16(step2[22], step1[17]); - step3[18] = ADD_EPI16(step2[21], step1[18]); - step3[19] = ADD_EPI16(step2[20], step1[19]); - step3[20] = SUB_EPI16(step1[19], step2[20]); - step3[21] = SUB_EPI16(step1[18], step2[21]); - step3[22] = SUB_EPI16(step1[17], step2[22]); - step3[23] = SUB_EPI16(step1[16], step2[23]); - step3[24] = SUB_EPI16(step1[31], step2[24]); - step3[25] = SUB_EPI16(step1[30], step2[25]); - step3[26] = SUB_EPI16(step1[29], step2[26]); - step3[27] = SUB_EPI16(step1[28], step2[27]); - step3[28] = ADD_EPI16(step2[27], step1[28]); - step3[29] = ADD_EPI16(step2[26], step1[29]); - step3[30] = ADD_EPI16(step2[25], step1[30]); - step3[31] = ADD_EPI16(step2[24], step1[31]); + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = 
_mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step3[16], &step3[17], &step3[18], &step3[19], - &step3[20], &step3[21], &step3[22], &step3[23], - &step3[24], &step3[25], &step3[26], &step3[27], - &step3[28], &step3[29], &step3[30], &step3[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12], + &step3[13]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + { + step3[16] = ADD_EPI16(step2[23], step1[16]); + step3[17] = ADD_EPI16(step2[22], step1[17]); + step3[18] = ADD_EPI16(step2[21], step1[18]); + step3[19] = ADD_EPI16(step2[20], step1[19]); + step3[20] = SUB_EPI16(step1[19], step2[20]); + step3[21] = SUB_EPI16(step1[18], step2[21]); + step3[22] = SUB_EPI16(step1[17], step2[22]); + step3[23] = SUB_EPI16(step1[16], step2[23]); + step3[24] = SUB_EPI16(step1[31], step2[24]); + step3[25] = SUB_EPI16(step1[30], step2[25]); + step3[26] = SUB_EPI16(step1[29], step2[26]); + step3[27] = SUB_EPI16(step1[28], step2[27]); + step3[28] = ADD_EPI16(step2[27], step1[28]); + step3[29] = ADD_EPI16(step2[26], step1[29]); + step3[30] = ADD_EPI16(step2[25], step1[30]); + step3[31] = ADD_EPI16(step2[24], step1[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step3[16], 
&step3[17], &step3[18], &step3[19], &step3[20], + &step3[21], &step3[22], &step3[23], &step3[24], &step3[25], + &step3[26], &step3[27], &step3[28], &step3[29], &step3[30], + &step3[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } + } - // Stage 4 - { - step1[0] = ADD_EPI16(step3[ 3], step3[ 0]); - step1[1] = ADD_EPI16(step3[ 2], step3[ 1]); - step1[2] = SUB_EPI16(step3[ 1], step3[ 2]); - step1[3] = SUB_EPI16(step3[ 0], step3[ 3]); - step1[8] = ADD_EPI16(step3[11], step2[ 8]); - step1[9] = ADD_EPI16(step3[10], step2[ 9]); - step1[10] = SUB_EPI16(step2[ 9], step3[10]); - step1[11] = SUB_EPI16(step2[ 8], step3[11]); - step1[12] = SUB_EPI16(step2[15], step3[12]); - step1[13] = SUB_EPI16(step2[14], step3[13]); - step1[14] = ADD_EPI16(step3[13], step2[14]); - step1[15] = ADD_EPI16(step3[12], step2[15]); + // Stage 4 + { + step1[0] = ADD_EPI16(step3[3], step3[0]); + step1[1] = ADD_EPI16(step3[2], step3[1]); + step1[2] = SUB_EPI16(step3[1], step3[2]); + step1[3] = SUB_EPI16(step3[0], step3[3]); + step1[8] = ADD_EPI16(step3[11], step2[8]); + step1[9] = ADD_EPI16(step3[10], step2[9]); + step1[10] = SUB_EPI16(step2[9], step3[10]); + step1[11] = SUB_EPI16(step2[8], step3[11]); + step1[12] = SUB_EPI16(step2[15], step3[12]); + step1[13] = SUB_EPI16(step2[14], step3[13]); + step1[14] = ADD_EPI16(step3[13], step2[14]); + step1[15] = ADD_EPI16(step3[12], step2[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[0], &step1[1], &step1[2], &step1[3], - &step1[4], &step1[5], &step1[6], &step1[7], - &step1[8], &step1[9], &step1[10], &step1[11], - &step1[12], &step1[13], &step1[14], &step1[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x16( + &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5], + &step1[6], &step1[7], &step1[8], &step1[9], &step1[10], + &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); - if 
(overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, 
DCT_CONST_BITS); - const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], - &step1[21], &step1[26], &step1[27], - &step1[28], &step1[29]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - // Stage 5 - { - step2[4] = ADD_EPI16(step1[5], step3[4]); - step2[5] = SUB_EPI16(step3[4], step1[5]); - step2[6] = SUB_EPI16(step3[7], step1[6]); - step2[7] = ADD_EPI16(step1[6], step3[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[4], &step2[5], - &step2[6], &step2[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); - const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); - const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); - const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); - const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); - const 
__m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); - out[16] = _mm_packs_epi32(out_16_6, out_16_7); - out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); - out[24] = _mm_packs_epi32(out_24_6, out_24_7); + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, 
k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&out[0], &out[16], - &out[8], &out[24]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], + &step1[21], &step1[26], &step1[27], + &step1[28], &step1[29]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); - const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); - const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); - const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); - const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m128i 
s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[9], &step2[10], - &step2[13], &step2[14]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step2[16] = ADD_EPI16(step1[19], step3[16]); - step2[17] = ADD_EPI16(step1[18], step3[17]); - step2[18] = SUB_EPI16(step3[17], step1[18]); - step2[19] = SUB_EPI16(step3[16], step1[19]); - step2[20] = SUB_EPI16(step3[23], step1[20]); - step2[21] = SUB_EPI16(step3[22], step1[21]); - step2[22] = ADD_EPI16(step1[21], step3[22]); - step2[23] = ADD_EPI16(step1[20], step3[23]); - step2[24] = ADD_EPI16(step1[27], step3[24]); - step2[25] = ADD_EPI16(step1[26], step3[25]); - step2[26] = SUB_EPI16(step3[25], step1[26]); - step2[27] = SUB_EPI16(step3[24], step1[27]); - step2[28] = SUB_EPI16(step3[31], step1[28]); - step2[29] = SUB_EPI16(step3[30], step1[29]); - step2[30] = ADD_EPI16(step1[29], step3[30]); - step2[31] = ADD_EPI16(step1[28], step3[31]); + // Stage 5 + { + step2[4] = ADD_EPI16(step1[5], step3[4]); + step2[5] = SUB_EPI16(step3[4], step1[5]); + step2[6] = SUB_EPI16(step3[7], step1[6]); + step2[7] = ADD_EPI16(step1[6], step3[7]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step2[16], &step2[17], &step2[18], &step2[19], - &step2[20], &step2[21], &step2[22], &step2[23], - &step2[24], &step2[25], &step2[26], &step2[27], - &step2[28], &step2[29], &step2[30], &step2[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x4(&step2[4], 
&step2[5], &step2[6], + &step2[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - // Stage 6 - { - const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[4] = _mm_packs_epi32(out_04_6, out_04_7); - out[20] = _mm_packs_epi32(out_20_6, out_20_7); - out[12] = _mm_packs_epi32(out_12_6, out_12_7); - out[28] = _mm_packs_epi32(out_28_6, out_28_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&out[4], &out[20], - &out[12], &out[28]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[8] = ADD_EPI16(step2[ 9], step1[ 8]); - step3[9] = SUB_EPI16(step1[ 8], step2[ 9]); - step3[10] = SUB_EPI16(step1[11], step2[10]); - step3[11] = ADD_EPI16(step2[10], step1[11]); - step3[12] = ADD_EPI16(step2[13], step1[12]); - step3[13] = SUB_EPI16(step1[12], step2[13]); - step3[14] = SUB_EPI16(step1[15], step2[14]); - step3[15] = ADD_EPI16(step2[14], step1[15]); + { + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + 
const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = + _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = + _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = + _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = + _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = + _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = + _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = + _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = + _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], - &step3[11], &step3[12], &step3[13], - &step3[14], &step3[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); - const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); - const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); - const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); - const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); - const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); - const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); - const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); - const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m128i s3_21_2 = 
_mm_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); -#if DCT_HIGH_BIT_DEPTH - overflow = 
check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], - &step3[22], &step3[25], &step3[26], - &step3[29], &step3[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13], + &step2[14]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - // Stage 7 - { - const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); - const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); - const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); - const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); - const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); - const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); - const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); - const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); - const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); - 
const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); - out[18] = _mm_packs_epi32(out_18_6, out_18_7); - out[10] = _mm_packs_epi32(out_10_6, out_10_7); - out[26] = _mm_packs_epi32(out_26_6, out_26_7); - out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); - out[22] = _mm_packs_epi32(out_22_6, 
out_22_7); - out[14] = _mm_packs_epi32(out_14_6, out_14_7); - out[30] = _mm_packs_epi32(out_30_6, out_30_7); + } + { + step2[16] = ADD_EPI16(step1[19], step3[16]); + step2[17] = ADD_EPI16(step1[18], step3[17]); + step2[18] = SUB_EPI16(step3[17], step1[18]); + step2[19] = SUB_EPI16(step3[16], step1[19]); + step2[20] = SUB_EPI16(step3[23], step1[20]); + step2[21] = SUB_EPI16(step3[22], step1[21]); + step2[22] = ADD_EPI16(step1[21], step3[22]); + step2[23] = ADD_EPI16(step1[20], step3[23]); + step2[24] = ADD_EPI16(step1[27], step3[24]); + step2[25] = ADD_EPI16(step1[26], step3[25]); + step2[26] = SUB_EPI16(step3[25], step1[26]); + step2[27] = SUB_EPI16(step3[24], step1[27]); + step2[28] = SUB_EPI16(step3[31], step1[28]); + step2[29] = SUB_EPI16(step3[30], step1[29]); + step2[30] = ADD_EPI16(step1[29], step3[30]); + step2[31] = ADD_EPI16(step1[28], step3[31]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10], - &out[26], &out[6], &out[22], - &out[14], &out[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = check_epi16_overflow_x16( + &step2[16], &step2[17], &step2[18], &step2[19], &step2[20], + &step2[21], &step2[22], &step2[23], &step2[24], &step2[25], + &step2[26], &step2[27], &step2[28], &step2[29], &step2[30], + &step2[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = + _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = + _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = + _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = + _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = + _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = + _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = + _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = + _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, 
DCT_CONST_BITS); + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - step1[16] = ADD_EPI16(step3[17], step2[16]); - step1[17] = SUB_EPI16(step2[16], step3[17]); - step1[18] = SUB_EPI16(step2[19], step3[18]); - step1[19] = ADD_EPI16(step3[18], step2[19]); - step1[20] = ADD_EPI16(step3[21], step2[20]); - step1[21] = SUB_EPI16(step2[20], step3[21]); - step1[22] = SUB_EPI16(step2[23], step3[22]); - step1[23] = ADD_EPI16(step3[22], step2[23]); - step1[24] = ADD_EPI16(step3[25], step2[24]); - step1[25] = SUB_EPI16(step2[24], step3[25]); - step1[26] = SUB_EPI16(step2[27], step3[26]); - step1[27] = ADD_EPI16(step3[26], step2[27]); - step1[28] = ADD_EPI16(step3[29], step2[28]); - step1[29] = SUB_EPI16(step2[28], step3[29]); - step1[30] = SUB_EPI16(step2[31], step3[30]); - step1[31] = ADD_EPI16(step3[30], step2[31]); + } + { + step3[8] = ADD_EPI16(step2[9], step1[8]); + step3[9] = SUB_EPI16(step1[8], step2[9]); + step3[10] = SUB_EPI16(step1[11], step2[10]); + step3[11] = ADD_EPI16(step2[10], step1[11]); + step3[12] = ADD_EPI16(step2[13], step1[12]); + step3[13] = SUB_EPI16(step1[12], step2[13]); + step3[14] = SUB_EPI16(step1[15], step2[14]); + step3[15] = ADD_EPI16(step2[14], step1[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[16], &step1[17], &step1[18], &step1[19], - &step1[20], &step1[21], &step1[22], &step1[23], - &step1[24], &step1[25], &step1[26], &step1[27], - &step1[28], &step1[29], &step1[30], &step1[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], + &step3[11], &step3[12], &step3[13], + &step3[14], &step3[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, 
k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); +#if 
DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], + &step3[22], &step3[25], &step3[26], + &step3[29], &step3[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - // Final stage --- outputs indices are bit-reversed. - { - const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); - const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); - const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); - const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); - const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); - const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); - const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); - const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); - const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m128i 
out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); - out[17] = _mm_packs_epi32(out_17_6, out_17_7); - out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); - out[25] = _mm_packs_epi32(out_25_6, out_25_7); - out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); - out[23] = _mm_packs_epi32(out_23_6, out_23_7); - out[15] = _mm_packs_epi32(out_15_6, out_15_7); - out[31] = _mm_packs_epi32(out_31_6, out_31_7); + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = + _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m128i out_02_5 = + _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = + _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = + _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = + _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = + _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = + _mm_add_epi32(out_26_2, 
k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = + _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = + _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = + _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = + _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = + _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = + _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = + _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = + _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = + _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9], - &out[25], &out[7], &out[23], - &out[15], &out[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + { + step1[16] = ADD_EPI16(step3[17], step2[16]); + step1[17] = SUB_EPI16(step2[16], step3[17]); + step1[18] = SUB_EPI16(step2[19], step3[18]); + step1[19] = ADD_EPI16(step3[18], step2[19]); + step1[20] = ADD_EPI16(step3[21], step2[20]); + step1[21] = SUB_EPI16(step2[20], step3[21]); + step1[22] = SUB_EPI16(step2[23], step3[22]); + step1[23] = ADD_EPI16(step3[22], step2[23]); + step1[24] = ADD_EPI16(step3[25], step2[24]); + step1[25] = SUB_EPI16(step2[24], step3[25]); + step1[26] = SUB_EPI16(step2[27], step3[26]); + step1[27] = ADD_EPI16(step3[26], step2[27]); + step1[28] = ADD_EPI16(step3[29], step2[28]); + step1[29] = SUB_EPI16(step2[28], step3[29]); + step1[30] = SUB_EPI16(step2[31], step3[30]); + step1[31] 
= ADD_EPI16(step3[30], step2[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step1[16], &step1[17], &step1[18], &step1[19], &step1[20], + &step1[21], &step1[22], &step1[23], &step1[24], &step1[25], + &step1[26], &step1[27], &step1[28], &step1[29], &step1[30], + &step1[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); - const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); - const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); - const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); - const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); - const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); - const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); - const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); - const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); - 
const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); - out[21] = _mm_packs_epi32(out_21_6, out_21_7); - out[13] = _mm_packs_epi32(out_13_6, out_13_7); - out[29] = _mm_packs_epi32(out_29_6, out_29_7); - out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); - out[19] = _mm_packs_epi32(out_19_6, out_19_7); - out[11] = _mm_packs_epi32(out_11_6, out_11_7); - out[27] = _mm_packs_epi32(out_27_6, out_27_7); + } + // Final stage --- outputs indices are bit-reversed. + { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = + _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = + _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = + _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = + _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = + _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i 
out_09_5 = + _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = + _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = + _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = + _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = + _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = + _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = + _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = + _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = + _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = + _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = + _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13], - &out[29], &out[3], &out[19], - &out[11], &out[27]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, 
k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = + _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = + _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = + _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = + _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = + _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = + _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = + _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = + _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = + _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = + _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = + _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = + _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = + _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = + _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = + _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = + _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, 
out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } + } #if FDCT32x32_HIGH_PRECISION } else { __m128i lstep1[64], lstep2[64], lstep3[64]; @@ -1457,32 +1512,32 @@ void FDCT32x32_2D(const int16_t *input, // stage 3 { // expanding to 32-bit length priori to addition operations - lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero); - lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero); - lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero); - lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero); - lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero); - lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero); - lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero); - lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero); - lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero); - lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero); - lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero); - lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero); - lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero); - lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero); - lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero); - lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero); - lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne); - lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne); - lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne); - lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne); - lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne); - lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne); - lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne); - lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne); - lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne); - lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne); + lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero); + lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero); + lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero); + lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero); + lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero); + lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero); + lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero); + lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero); + lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero); + lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero); + lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero); + lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero); + lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero); + lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero); + lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero); + lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero); + lstep2[0] = _mm_madd_epi16(lstep2[0], kOne); + lstep2[1] = _mm_madd_epi16(lstep2[1], kOne); + lstep2[2] = _mm_madd_epi16(lstep2[2], kOne); + lstep2[3] = _mm_madd_epi16(lstep2[3], kOne); + lstep2[4] = _mm_madd_epi16(lstep2[4], kOne); + lstep2[5] = _mm_madd_epi16(lstep2[5], kOne); + lstep2[6] = _mm_madd_epi16(lstep2[6], kOne); + lstep2[7] = _mm_madd_epi16(lstep2[7], kOne); + lstep2[8] = _mm_madd_epi16(lstep2[8], kOne); + lstep2[9] = _mm_madd_epi16(lstep2[9], kOne); lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); lstep2[11] = 
_mm_madd_epi16(lstep2[11], kOne); lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); @@ -1490,22 +1545,22 @@ void FDCT32x32_2D(const int16_t *input, lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); - lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); - lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); - lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); - lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); - lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); - lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); - lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); - lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); - lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]); - lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]); - lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]); - lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]); - lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]); - lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]); - lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]); - lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]); + lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]); + lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]); + lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]); + lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]); + lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]); + lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]); + lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]); + lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]); + lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]); + lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]); + lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]); + lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]); + lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]); + lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]); + lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]); + lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]); } { const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); @@ -1643,10 +1698,10 @@ void FDCT32x32_2D(const int16_t *input, // stage 4 { // expanding to 32-bit length priori to addition operations - lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero); - lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero); - lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero); - lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero); + lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero); + lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero); + lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero); + lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero); lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); @@ -1660,14 +1715,14 @@ void FDCT32x32_2D(const int16_t *input, lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); - lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); - lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); - lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]); - lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]); - lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]); - lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]); - lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]); - lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]); + lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm_sub_epi32(lstep3[2], 
lstep3[4]); + lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]); lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); @@ -1686,64 +1741,64 @@ void FDCT32x32_2D(const int16_t *input, lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); } { - // to be continued... - // - const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); - const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_ to further hide - // instruction latency. - v[0] = k_madd_epi32(u[0], k32_p16_m16); - v[1] = k_madd_epi32(u[1], k32_p16_m16); - v[2] = k_madd_epi32(u[2], k32_p16_m16); - v[3] = k_madd_epi32(u[3], k32_p16_m16); - v[4] = k_madd_epi32(u[0], k32_p16_p16); - v[5] = k_madd_epi32(u[1], k32_p16_p16); - v[6] = k_madd_epi32(u[2], k32_p16_p16); - v[7] = k_madd_epi32(u[3], k32_p16_p16); + // to be continued... + // + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[0] = k_madd_epi32(u[0], k32_p16_m16); + v[1] = k_madd_epi32(u[1], k32_p16_m16); + v[2] = k_madd_epi32(u[2], k32_p16_m16); + v[3] = k_madd_epi32(u[3], k32_p16_m16); + v[4] = k_madd_epi32(u[0], k32_p16_p16); + v[5] = k_madd_epi32(u[1], k32_p16_p16); + v[6] = k_madd_epi32(u[2], k32_p16_p16); + v[7] = k_madd_epi32(u[3], k32_p16_p16); #if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], - &v[4], &v[5], &v[6], &v[7], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4], + &v[5], &v[6], &v[7], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + 
lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); } { const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); - u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); - u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); - u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); - u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); - u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); - u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); - u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); - u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); - u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); + u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); + u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); @@ -1751,16 +1806,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); - v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24); - v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24); - v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24); - v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24); - v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24); - v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24); - v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24); - v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24); - v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08); - v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08); + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m08_p24); + v[5] = k_madd_epi32(u[5], k32_m08_p24); + v[6] = k_madd_epi32(u[6], k32_m08_p24); + v[7] = k_madd_epi32(u[7], k32_m08_p24); + v[8] = k_madd_epi32(u[8], k32_m24_m08); + v[9] = k_madd_epi32(u[9], k32_m24_m08); v[10] = k_madd_epi32(u[10], k32_m24_m08); v[11] = k_madd_epi32(u[11], k32_m24_m08); v[12] = k_madd_epi32(u[12], k32_m24_m08); @@ -1771,41 +1826,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m08_p24); v[18] = k_madd_epi32(u[14], k32_m08_p24); v[19] = k_madd_epi32(u[15], k32_m08_p24); - v[20] = k_madd_epi32(u[ 8], k32_m08_p24); - v[21] = k_madd_epi32(u[ 9], k32_m08_p24); + v[20] = k_madd_epi32(u[8], k32_m08_p24); + v[21] = k_madd_epi32(u[9], k32_m08_p24); v[22] = k_madd_epi32(u[10], k32_m08_p24); v[23] = k_madd_epi32(u[11], k32_m08_p24); - v[24] = k_madd_epi32(u[ 4], k32_p24_p08); - v[25] = k_madd_epi32(u[ 5], k32_p24_p08); - v[26] = k_madd_epi32(u[ 6], k32_p24_p08); - v[27] = k_madd_epi32(u[ 7], k32_p24_p08); - v[28] = k_madd_epi32(u[ 0], k32_p24_p08); - v[29] = k_madd_epi32(u[ 1], k32_p24_p08); - v[30] = k_madd_epi32(u[ 2], k32_p24_p08); - v[31] = k_madd_epi32(u[ 3], k32_p24_p08); + v[24] = k_madd_epi32(u[4], k32_p24_p08); + v[25] = 
k_madd_epi32(u[5], k32_p24_p08); + v[26] = k_madd_epi32(u[6], k32_p24_p08); + v[27] = k_madd_epi32(u[7], k32_p24_p08); + v[28] = k_madd_epi32(u[0], k32_p24_p08); + v[29] = k_madd_epi32(u[1], k32_p24_p08); + v[30] = k_madd_epi32(u[2], k32_p24_p08); + v[31] = k_madd_epi32(u[3], k32_p24_p08); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -1813,16 +1867,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -1830,16 +1884,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - lstep1[38] = _mm_srai_epi32(v[ 2], 
DCT_CONST_BITS); - lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS); lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -1849,10 +1903,10 @@ void FDCT32x32_2D(const int16_t *input, } // stage 5 { - lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]); - lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]); - lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]); - lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]); + lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]); lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); @@ -1875,16 +1929,16 @@ void FDCT32x32_2D(const int16_t *input, // TODO(jingning): manually inline k_madd_epi32_ to further hide // instruction latency. 
- v[ 0] = k_madd_epi32(u[0], k32_p16_p16); - v[ 1] = k_madd_epi32(u[1], k32_p16_p16); - v[ 2] = k_madd_epi32(u[2], k32_p16_p16); - v[ 3] = k_madd_epi32(u[3], k32_p16_p16); - v[ 4] = k_madd_epi32(u[0], k32_p16_m16); - v[ 5] = k_madd_epi32(u[1], k32_p16_m16); - v[ 6] = k_madd_epi32(u[2], k32_p16_m16); - v[ 7] = k_madd_epi32(u[3], k32_p16_m16); - v[ 8] = k_madd_epi32(u[4], k32_p24_p08); - v[ 9] = k_madd_epi32(u[5], k32_p24_p08); + v[0] = k_madd_epi32(u[0], k32_p16_p16); + v[1] = k_madd_epi32(u[1], k32_p16_p16); + v[2] = k_madd_epi32(u[2], k32_p16_p16); + v[3] = k_madd_epi32(u[3], k32_p16_p16); + v[4] = k_madd_epi32(u[0], k32_p16_m16); + v[5] = k_madd_epi32(u[1], k32_p16_m16); + v[6] = k_madd_epi32(u[2], k32_p16_m16); + v[7] = k_madd_epi32(u[3], k32_p16_m16); + v[8] = k_madd_epi32(u[4], k32_p24_p08); + v[9] = k_madd_epi32(u[5], k32_p24_p08); v[10] = k_madd_epi32(u[6], k32_p24_p08); v[11] = k_madd_epi32(u[7], k32_p24_p08); v[12] = k_madd_epi32(u[4], k32_m08_p24); @@ -1894,9 +1948,8 @@ void FDCT32x32_2D(const int16_t *input, #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -1966,13 +2019,13 @@ void FDCT32x32_2D(const int16_t *input, u[7] = _mm_srai_epi32(u[7], 2); // Combine - out[ 0] = _mm_packs_epi32(u[0], u[1]); + out[0] = _mm_packs_epi32(u[0], u[1]); out[16] = _mm_packs_epi32(u[2], u[3]); - out[ 8] = _mm_packs_epi32(u[4], u[5]); + out[8] = _mm_packs_epi32(u[4], u[5]); out[24] = _mm_packs_epi32(u[6], u[7]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&out[0], &out[16], - &out[8], &out[24]); + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2001,8 +2054,8 @@ void FDCT32x32_2D(const int16_t *input, v[5] = k_madd_epi32(u[5], k32_m24_m08); v[6] = k_madd_epi32(u[6], k32_m24_m08); v[7] = k_madd_epi32(u[7], k32_m24_m08); - v[ 8] = k_madd_epi32(u[4], k32_m08_p24); - v[ 9] = k_madd_epi32(u[5], k32_m08_p24); + v[8] = k_madd_epi32(u[4], k32_m08_p24); + v[9] = k_madd_epi32(u[5], k32_m08_p24); v[10] = k_madd_epi32(u[6], k32_m08_p24); v[11] = k_madd_epi32(u[7], k32_m08_p24); v[12] = k_madd_epi32(u[0], k32_p24_p08); @@ -2012,9 +2065,8 @@ void FDCT32x32_2D(const int16_t *input, #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2088,10 +2140,10 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); - u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); - u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); - u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); - u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); + u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + 
u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); @@ -2100,10 +2152,10 @@ void FDCT32x32_2D(const int16_t *input, u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); - u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); - u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); - u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); - u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); + u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); v[0] = k_madd_epi32(u[0], k32_p28_p04); v[1] = k_madd_epi32(u[1], k32_p28_p04); @@ -2113,8 +2165,8 @@ void FDCT32x32_2D(const int16_t *input, v[5] = k_madd_epi32(u[5], k32_p12_p20); v[6] = k_madd_epi32(u[6], k32_p12_p20); v[7] = k_madd_epi32(u[7], k32_p12_p20); - v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); - v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); v[10] = k_madd_epi32(u[10], k32_m20_p12); v[11] = k_madd_epi32(u[11], k32_m20_p12); v[12] = k_madd_epi32(u[12], k32_m04_p28); @@ -2124,9 +2176,8 @@ void FDCT32x32_2D(const int16_t *input, #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2195,13 +2246,13 @@ void FDCT32x32_2D(const int16_t *input, u[6] = _mm_srai_epi32(u[6], 2); u[7] = _mm_srai_epi32(u[7], 2); - out[ 4] = _mm_packs_epi32(u[0], u[1]); + out[4] = _mm_packs_epi32(u[0], u[1]); out[20] = _mm_packs_epi32(u[2], u[3]); out[12] = _mm_packs_epi32(u[4], u[5]); out[28] = _mm_packs_epi32(u[6], u[7]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&out[4], &out[20], - &out[12], &out[28]); + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2230,21 +2281,21 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); - const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64, - -cospi_20_64); + const __m128i k32_m12_m20 = + pair_set_epi32(-cospi_12_64, -cospi_20_64); const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); - u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); - u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); - u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); - u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); - u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); - u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); - u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); - u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); - u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); - u[ 9] = _mm_unpackhi_epi32(lstep2[42], 
lstep2[52]); + u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); @@ -2252,16 +2303,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); - v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28); - v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28); - v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28); - v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28); - v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04); - v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04); - v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04); - v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04); - v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); - v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); + v[0] = k_madd_epi32(u[0], k32_m04_p28); + v[1] = k_madd_epi32(u[1], k32_m04_p28); + v[2] = k_madd_epi32(u[2], k32_m04_p28); + v[3] = k_madd_epi32(u[3], k32_m04_p28); + v[4] = k_madd_epi32(u[4], k32_m28_m04); + v[5] = k_madd_epi32(u[5], k32_m28_m04); + v[6] = k_madd_epi32(u[6], k32_m28_m04); + v[7] = k_madd_epi32(u[7], k32_m28_m04); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); v[10] = k_madd_epi32(u[10], k32_m20_p12); v[11] = k_madd_epi32(u[11], k32_m20_p12); v[12] = k_madd_epi32(u[12], k32_m12_m20); @@ -2272,41 +2323,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m20_p12); v[18] = k_madd_epi32(u[14], k32_m20_p12); v[19] = k_madd_epi32(u[15], k32_m20_p12); - v[20] = k_madd_epi32(u[ 8], k32_p12_p20); - v[21] = k_madd_epi32(u[ 9], k32_p12_p20); + v[20] = k_madd_epi32(u[8], k32_p12_p20); + v[21] = k_madd_epi32(u[9], k32_p12_p20); v[22] = k_madd_epi32(u[10], k32_p12_p20); v[23] = k_madd_epi32(u[11], k32_p12_p20); - v[24] = k_madd_epi32(u[ 4], k32_m04_p28); - v[25] = k_madd_epi32(u[ 5], k32_m04_p28); - v[26] = k_madd_epi32(u[ 6], k32_m04_p28); - v[27] = k_madd_epi32(u[ 7], k32_m04_p28); - v[28] = k_madd_epi32(u[ 0], k32_p28_p04); - v[29] = k_madd_epi32(u[ 1], k32_p28_p04); - v[30] = k_madd_epi32(u[ 2], k32_p28_p04); - v[31] = k_madd_epi32(u[ 3], k32_p28_p04); + v[24] = k_madd_epi32(u[4], k32_m04_p28); + v[25] = k_madd_epi32(u[5], k32_m04_p28); + v[26] = k_madd_epi32(u[6], k32_m04_p28); + v[27] = k_madd_epi32(u[7], k32_m04_p28); + v[28] = k_madd_epi32(u[0], k32_p28_p04); + v[29] = k_madd_epi32(u[1], k32_p28_p04); + v[30] = k_madd_epi32(u[2], k32_p28_p04); + v[31] = k_madd_epi32(u[3], k32_p28_p04); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], 
&v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -2314,16 +2364,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2331,16 +2381,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); - lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = 
_mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS); lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -2353,22 +2403,22 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); - const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); + const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); - u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); - u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); - u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); - u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); - u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); - u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); - u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); - u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); - u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); - u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); + u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); @@ -2376,16 +2426,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); - v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02); - v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02); - v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02); - v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02); - v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18); - v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18); - v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18); - v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18); - v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10); - v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10); + v[0] = k_madd_epi32(u[0], k32_p30_p02); + v[1] = k_madd_epi32(u[1], k32_p30_p02); + v[2] = k_madd_epi32(u[2], k32_p30_p02); + v[3] = k_madd_epi32(u[3], k32_p30_p02); + v[4] = k_madd_epi32(u[4], k32_p14_p18); + v[5] = k_madd_epi32(u[5], k32_p14_p18); + v[6] = k_madd_epi32(u[6], k32_p14_p18); + v[7] = k_madd_epi32(u[7], k32_p14_p18); + v[8] = k_madd_epi32(u[8], k32_p22_p10); + v[9] = k_madd_epi32(u[9], k32_p22_p10); v[10] = k_madd_epi32(u[10], k32_p22_p10); v[11] = k_madd_epi32(u[11], k32_p22_p10); v[12] = k_madd_epi32(u[12], k32_p06_p26); @@ -2396,41 +2446,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m26_p06); v[18] = k_madd_epi32(u[14], k32_m26_p06); v[19] = k_madd_epi32(u[15], k32_m26_p06); - v[20] = k_madd_epi32(u[ 8], 
k32_m10_p22); - v[21] = k_madd_epi32(u[ 9], k32_m10_p22); + v[20] = k_madd_epi32(u[8], k32_m10_p22); + v[21] = k_madd_epi32(u[9], k32_m10_p22); v[22] = k_madd_epi32(u[10], k32_m10_p22); v[23] = k_madd_epi32(u[11], k32_m10_p22); - v[24] = k_madd_epi32(u[ 4], k32_m18_p14); - v[25] = k_madd_epi32(u[ 5], k32_m18_p14); - v[26] = k_madd_epi32(u[ 6], k32_m18_p14); - v[27] = k_madd_epi32(u[ 7], k32_m18_p14); - v[28] = k_madd_epi32(u[ 0], k32_m02_p30); - v[29] = k_madd_epi32(u[ 1], k32_m02_p30); - v[30] = k_madd_epi32(u[ 2], k32_m02_p30); - v[31] = k_madd_epi32(u[ 3], k32_m02_p30); + v[24] = k_madd_epi32(u[4], k32_m18_p14); + v[25] = k_madd_epi32(u[5], k32_m18_p14); + v[26] = k_madd_epi32(u[6], k32_m18_p14); + v[27] = k_madd_epi32(u[7], k32_m18_p14); + v[28] = k_madd_epi32(u[0], k32_m02_p30); + v[29] = k_madd_epi32(u[1], k32_m02_p30); + v[30] = k_madd_epi32(u[2], k32_m02_p30); + v[31] = k_madd_epi32(u[3], k32_m02_p30); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -2438,16 +2487,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + 
v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2455,16 +2504,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -2472,16 +2521,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); - v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); - v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); - v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); - v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); - v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); - v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); - v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); - v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); - v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); v[10] = _mm_cmplt_epi32(u[10], kZero); v[11] = _mm_cmplt_epi32(u[11], kZero); v[12] = _mm_cmplt_epi32(u[12], kZero); @@ -2489,16 +2538,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_cmplt_epi32(u[14], kZero); v[15] = _mm_cmplt_epi32(u[15], kZero); - u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); 
u[10] = _mm_sub_epi32(u[10], v[10]); u[11] = _mm_sub_epi32(u[11], v[11]); u[12] = _mm_sub_epi32(u[12], v[12]); @@ -2506,16 +2555,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_sub_epi32(u[14], v[14]); u[15] = _mm_sub_epi32(u[15], v[15]); - v[ 0] = _mm_add_epi32(u[ 0], K32One); - v[ 1] = _mm_add_epi32(u[ 1], K32One); - v[ 2] = _mm_add_epi32(u[ 2], K32One); - v[ 3] = _mm_add_epi32(u[ 3], K32One); - v[ 4] = _mm_add_epi32(u[ 4], K32One); - v[ 5] = _mm_add_epi32(u[ 5], K32One); - v[ 6] = _mm_add_epi32(u[ 6], K32One); - v[ 7] = _mm_add_epi32(u[ 7], K32One); - v[ 8] = _mm_add_epi32(u[ 8], K32One); - v[ 9] = _mm_add_epi32(u[ 9], K32One); + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); v[10] = _mm_add_epi32(u[10], K32One); v[11] = _mm_add_epi32(u[11], K32One); v[12] = _mm_add_epi32(u[12], K32One); @@ -2523,16 +2572,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], K32One); v[15] = _mm_add_epi32(u[15], K32One); - u[ 0] = _mm_srai_epi32(v[ 0], 2); - u[ 1] = _mm_srai_epi32(v[ 1], 2); - u[ 2] = _mm_srai_epi32(v[ 2], 2); - u[ 3] = _mm_srai_epi32(v[ 3], 2); - u[ 4] = _mm_srai_epi32(v[ 4], 2); - u[ 5] = _mm_srai_epi32(v[ 5], 2); - u[ 6] = _mm_srai_epi32(v[ 6], 2); - u[ 7] = _mm_srai_epi32(v[ 7], 2); - u[ 8] = _mm_srai_epi32(v[ 8], 2); - u[ 9] = _mm_srai_epi32(v[ 9], 2); + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); u[10] = _mm_srai_epi32(v[10], 2); u[11] = _mm_srai_epi32(v[11], 2); u[12] = _mm_srai_epi32(v[12], 2); @@ -2540,18 +2589,18 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], 2); u[15] = _mm_srai_epi32(v[15], 2); - out[ 2] = _mm_packs_epi32(u[0], u[1]); + out[2] = _mm_packs_epi32(u[0], u[1]); out[18] = _mm_packs_epi32(u[2], u[3]); out[10] = _mm_packs_epi32(u[4], u[5]); out[26] = _mm_packs_epi32(u[6], u[7]); - out[ 6] = _mm_packs_epi32(u[8], u[9]); + out[6] = _mm_packs_epi32(u[8], u[9]); out[22] = _mm_packs_epi32(u[10], u[11]); out[14] = _mm_packs_epi32(u[12], u[13]); out[30] = _mm_packs_epi32(u[14], u[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10], - &out[26], &out[6], &out[22], - &out[14], &out[30]); + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2603,16 +2652,16 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); - u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); - u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); - u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); - u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); - u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); - u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); - u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); - u[ 7] = 
_mm_unpackhi_epi32(lstep1[35], lstep1[61]); - u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); - u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); + u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); @@ -2620,16 +2669,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); - v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01); - v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01); - v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01); - v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01); - v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17); - v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17); - v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17); - v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17); - v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09); - v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09); + v[0] = k_madd_epi32(u[0], k32_p31_p01); + v[1] = k_madd_epi32(u[1], k32_p31_p01); + v[2] = k_madd_epi32(u[2], k32_p31_p01); + v[3] = k_madd_epi32(u[3], k32_p31_p01); + v[4] = k_madd_epi32(u[4], k32_p15_p17); + v[5] = k_madd_epi32(u[5], k32_p15_p17); + v[6] = k_madd_epi32(u[6], k32_p15_p17); + v[7] = k_madd_epi32(u[7], k32_p15_p17); + v[8] = k_madd_epi32(u[8], k32_p23_p09); + v[9] = k_madd_epi32(u[9], k32_p23_p09); v[10] = k_madd_epi32(u[10], k32_p23_p09); v[11] = k_madd_epi32(u[11], k32_p23_p09); v[12] = k_madd_epi32(u[12], k32_p07_p25); @@ -2640,41 +2689,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m25_p07); v[18] = k_madd_epi32(u[14], k32_m25_p07); v[19] = k_madd_epi32(u[15], k32_m25_p07); - v[20] = k_madd_epi32(u[ 8], k32_m09_p23); - v[21] = k_madd_epi32(u[ 9], k32_m09_p23); + v[20] = k_madd_epi32(u[8], k32_m09_p23); + v[21] = k_madd_epi32(u[9], k32_m09_p23); v[22] = k_madd_epi32(u[10], k32_m09_p23); v[23] = k_madd_epi32(u[11], k32_m09_p23); - v[24] = k_madd_epi32(u[ 4], k32_m17_p15); - v[25] = k_madd_epi32(u[ 5], k32_m17_p15); - v[26] = k_madd_epi32(u[ 6], k32_m17_p15); - v[27] = k_madd_epi32(u[ 7], k32_m17_p15); - v[28] = k_madd_epi32(u[ 0], k32_m01_p31); - v[29] = k_madd_epi32(u[ 1], k32_m01_p31); - v[30] = k_madd_epi32(u[ 2], k32_m01_p31); - v[31] = k_madd_epi32(u[ 3], k32_m01_p31); + v[24] = k_madd_epi32(u[4], k32_m17_p15); + v[25] = k_madd_epi32(u[5], k32_m17_p15); + v[26] = k_madd_epi32(u[6], k32_m17_p15); + v[27] = k_madd_epi32(u[7], k32_m17_p15); + v[28] = k_madd_epi32(u[0], k32_m01_p31); + v[29] = k_madd_epi32(u[1], k32_m01_p31); + v[30] = k_madd_epi32(u[2], k32_m01_p31); + v[31] = k_madd_epi32(u[3], k32_m01_p31); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], 
&v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -2682,16 +2730,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2699,16 +2747,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], 
DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -2716,16 +2764,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); - v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); - v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); - v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); - v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); - v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); - v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); - v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); - v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); - v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); v[10] = _mm_cmplt_epi32(u[10], kZero); v[11] = _mm_cmplt_epi32(u[11], kZero); v[12] = _mm_cmplt_epi32(u[12], kZero); @@ -2733,16 +2781,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_cmplt_epi32(u[14], kZero); v[15] = _mm_cmplt_epi32(u[15], kZero); - u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); u[10] = _mm_sub_epi32(u[10], v[10]); u[11] = _mm_sub_epi32(u[11], v[11]); u[12] = _mm_sub_epi32(u[12], v[12]); @@ -2784,18 +2832,18 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], 2); u[15] = _mm_srai_epi32(v[15], 2); - out[ 1] = _mm_packs_epi32(u[0], u[1]); + out[1] = _mm_packs_epi32(u[0], u[1]); out[17] = _mm_packs_epi32(u[2], u[3]); - out[ 9] = _mm_packs_epi32(u[4], u[5]); + out[9] = _mm_packs_epi32(u[4], u[5]); out[25] = _mm_packs_epi32(u[6], u[7]); - out[ 7] = _mm_packs_epi32(u[8], u[9]); + out[7] = _mm_packs_epi32(u[8], u[9]); out[23] = _mm_packs_epi32(u[10], u[11]); out[15] = _mm_packs_epi32(u[12], u[13]); out[31] = _mm_packs_epi32(u[14], u[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9], - &out[25], &out[7], &out[23], - &out[15], &out[31]); + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2812,16 +2860,16 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); - u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); - u[ 1] = _mm_unpackhi_epi32(lstep1[40], 
lstep1[54]); - u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); - u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); - u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); - u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); - u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); - u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); - u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); - u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); + u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); @@ -2829,16 +2877,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); - v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05); - v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05); - v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05); - v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05); - v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21); - v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21); - v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21); - v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21); - v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13); - v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13); + v[0] = k_madd_epi32(u[0], k32_p27_p05); + v[1] = k_madd_epi32(u[1], k32_p27_p05); + v[2] = k_madd_epi32(u[2], k32_p27_p05); + v[3] = k_madd_epi32(u[3], k32_p27_p05); + v[4] = k_madd_epi32(u[4], k32_p11_p21); + v[5] = k_madd_epi32(u[5], k32_p11_p21); + v[6] = k_madd_epi32(u[6], k32_p11_p21); + v[7] = k_madd_epi32(u[7], k32_p11_p21); + v[8] = k_madd_epi32(u[8], k32_p19_p13); + v[9] = k_madd_epi32(u[9], k32_p19_p13); v[10] = k_madd_epi32(u[10], k32_p19_p13); v[11] = k_madd_epi32(u[11], k32_p19_p13); v[12] = k_madd_epi32(u[12], k32_p03_p29); @@ -2849,41 +2897,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m29_p03); v[18] = k_madd_epi32(u[14], k32_m29_p03); v[19] = k_madd_epi32(u[15], k32_m29_p03); - v[20] = k_madd_epi32(u[ 8], k32_m13_p19); - v[21] = k_madd_epi32(u[ 9], k32_m13_p19); + v[20] = k_madd_epi32(u[8], k32_m13_p19); + v[21] = k_madd_epi32(u[9], k32_m13_p19); v[22] = k_madd_epi32(u[10], k32_m13_p19); v[23] = k_madd_epi32(u[11], k32_m13_p19); - v[24] = k_madd_epi32(u[ 4], k32_m21_p11); - v[25] = k_madd_epi32(u[ 5], k32_m21_p11); - v[26] = k_madd_epi32(u[ 6], k32_m21_p11); - v[27] = k_madd_epi32(u[ 7], k32_m21_p11); - v[28] = k_madd_epi32(u[ 0], k32_m05_p27); - v[29] = k_madd_epi32(u[ 1], k32_m05_p27); - v[30] = k_madd_epi32(u[ 2], k32_m05_p27); - v[31] = k_madd_epi32(u[ 3], k32_m05_p27); + v[24] = k_madd_epi32(u[4], k32_m21_p11); + v[25] = k_madd_epi32(u[5], k32_m21_p11); + v[26] = k_madd_epi32(u[6], k32_m21_p11); + v[27] = k_madd_epi32(u[7], k32_m21_p11); + v[28] = k_madd_epi32(u[0], k32_m05_p27); + v[29] = k_madd_epi32(u[1], k32_m05_p27); + v[30] = k_madd_epi32(u[2], k32_m05_p27); + v[31] = k_madd_epi32(u[3], k32_m05_p27); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - 
&v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -2891,16 +2938,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2908,16 +2955,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = 
_mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -2925,16 +2972,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); - v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); - v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); - v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); - v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); - v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); - v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); - v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); - v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); - v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); v[10] = _mm_cmplt_epi32(u[10], kZero); v[11] = _mm_cmplt_epi32(u[11], kZero); v[12] = _mm_cmplt_epi32(u[12], kZero); @@ -2942,16 +2989,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_cmplt_epi32(u[14], kZero); v[15] = _mm_cmplt_epi32(u[15], kZero); - u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); u[10] = _mm_sub_epi32(u[10], v[10]); u[11] = _mm_sub_epi32(u[11], v[11]); u[12] = _mm_sub_epi32(u[12], v[12]); @@ -2993,18 +3040,18 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], 2); u[15] = _mm_srai_epi32(v[15], 2); - out[ 5] = _mm_packs_epi32(u[0], u[1]); + out[5] = _mm_packs_epi32(u[0], u[1]); out[21] = _mm_packs_epi32(u[2], u[3]); out[13] = _mm_packs_epi32(u[4], u[5]); out[29] = _mm_packs_epi32(u[6], u[7]); - out[ 3] = _mm_packs_epi32(u[8], u[9]); + out[3] = _mm_packs_epi32(u[8], u[9]); out[19] = _mm_packs_epi32(u[10], u[11]); out[11] = _mm_packs_epi32(u[12], u[13]); out[27] = _mm_packs_epi32(u[14], u[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13], - &out[29], &out[3], &out[19], - &out[11], &out[27]); + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c 
b/vpx_dsp/x86/fwd_txfm_avx2.c index 6d9da6aa89e62009c14218dc49346bdbaf7d105e..21f11f0c3e5ff05e27ac8d48c74cf1bae6ef2c7f 100644 --- a/vpx_dsp/x86/fwd_txfm_avx2.c +++ b/vpx_dsp/x86/fwd_txfm_avx2.c @@ -13,11 +13,11 @@ #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 #include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" -#undef FDCT32x32_2D_AVX2 -#undef FDCT32x32_HIGH_PRECISION +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D_AVX2 vpx_fdct32x32_avx2 #define FDCT32x32_HIGH_PRECISION 1 -#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT -#undef FDCT32x32_2D_AVX2 -#undef FDCT32x32_HIGH_PRECISION +#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION diff --git a/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/vpx_dsp/x86/fwd_txfm_impl_sse2.h index 69889e2e98cdf6d10788db303f3266fbf10a3718..743e55e635c5dc5d85b199c4a6ea46e15826f695 100644 --- a/vpx_dsp/x86/fwd_txfm_impl_sse2.h +++ b/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -43,44 +43,36 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { // These are the coefficients used for the multiplies. // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), // where cospi_N_64 = cos(N pi /64) - const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64, - cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64); - const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64, - cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64); - const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64, - cospi_8_64, cospi_24_64, - cospi_24_64, -cospi_8_64, - cospi_24_64, -cospi_8_64); - const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64, - cospi_24_64, -cospi_8_64, - cospi_8_64, cospi_24_64, - cospi_8_64, cospi_24_64); - const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64); - const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64); - const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64, - cospi_8_64, cospi_24_64, - -cospi_8_64, -cospi_24_64, - -cospi_8_64, -cospi_24_64); - const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64, - cospi_24_64, -cospi_8_64, - -cospi_24_64, cospi_8_64, - -cospi_24_64, cospi_8_64); + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + 
-cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // This second rounding constant saves doing some extra adds at the end - const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING - +(DCT_CONST_ROUNDING << 1)); - const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); __m128i in0, in1; @@ -90,14 +82,14 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { #endif // Load inputs. - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) - (input + 2 * stride))); - in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) - (input + 3 * stride))); - // in0 = [i0 i1 i2 i3 iC iD iE iF] - // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); +// in0 = [i0 i1 i2 i3 iC iD iE iF] +// in1 = [i4 i5 i6 i7 i8 i9 iA iB] #if DCT_HIGH_BIT_DEPTH // Check inputs small enough to use optimised code cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)), @@ -194,8 +186,8 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { const __m128i t0 = ADD_EPI16(in0, in1); const __m128i t1 = SUB_EPI16(in0, in1); - // t0 = [c0 c1 c8 c9 c4 c5 cC cD] - // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] +// t0 = [c0 c1 c8 c9 c4 c5 cC cD] +// t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x2(&t0, &t1); if (overflow) { @@ -263,7 +255,6 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { storeu_output(&in1, output + 2 * 4); } - void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { int pass; // Constants @@ -283,14 +274,14 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { int overflow; #endif // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i 
in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); // Pre-condition input (shift by two) in0 = _mm_slli_epi16(in0, 2); in1 = _mm_slli_epi16(in1, 2); @@ -319,8 +310,8 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { const __m128i q7 = SUB_EPI16(in0, in7); #if DCT_HIGH_BIT_DEPTH if (pass == 1) { - overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3, - &q4, &q5, &q6, &q7); + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); if (overflow) { vpx_highbd_fdct8x8_c(input, output, stride); return; @@ -630,22 +621,22 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { __m128i res08, res09, res10, res11, res12, res13, res14, res15; // Load and pre-condition input. if (0 == pass) { - in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); + in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); // x = x << 2 in00 = _mm_slli_epi16(in00, 2); in01 = _mm_slli_epi16(in01, 2); @@ -664,22 +655,22 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { in14 = _mm_slli_epi16(in14, 2); in15 = _mm_slli_epi16(in15, 2); } else { - in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); - in07 = _mm_load_si128((const 
__m128i *)(in + 7 * 16)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); + in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); // x = (x + 1) >> 2 in00 = _mm_add_epi16(in00, kOne); in01 = _mm_add_epi16(in01, kOne); @@ -745,10 +736,9 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step1_6 = SUB_EPI16(in01, in14); step1_7 = SUB_EPI16(in00, in15); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1_0, &step1_1, - &step1_2, &step1_3, - &step1_4, &step1_5, - &step1_6, &step1_7); + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -767,8 +757,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { const __m128i q6 = SUB_EPI16(input1, input6); const __m128i q7 = SUB_EPI16(input0, input7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3, - &q4, &q5, &q6, &q7); + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -818,12 +808,12 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { // into 32 bits. 
const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, - DCT_CONST_BITS); - const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, - DCT_CONST_BITS); + const __m128i r0 = + mult_round_shift(&d0, &d1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m128i r1 = + mult_round_shift(&d0, &d1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x2(&r0, &r1); if (overflow) { @@ -860,8 +850,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res02, &res14, - &res10, &res06); + overflow = + check_epi16_overflow_x4(&res02, &res14, &res10, &res06); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -888,8 +878,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, - &step2_4); + overflow = + check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -907,10 +897,9 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step3_6 = ADD_EPI16(step1_6, step2_5); step3_7 = ADD_EPI16(step1_7, step2_4); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3_0, &step3_1, - &step3_2, &step3_3, - &step3_4, &step3_5, - &step3_6, &step3_7); + overflow = + check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3, + &step3_4, &step3_5, &step3_6, &step3_7); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -932,8 +921,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, - &step2_5); + overflow = + check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -951,10 +940,9 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step1_6 = SUB_EPI16(step3_7, step2_6); step1_7 = ADD_EPI16(step3_7, step2_6); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1_0, &step1_1, - &step1_2, &step1_3, - &step1_4, &step1_5, - &step1_6, &step1_7); + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -1006,16 +994,14 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { } } // Transpose the results, do it as two 8x8 transposes. 
- transpose_and_output8x8(&res00, &res01, &res02, &res03, - &res04, &res05, &res06, &res07, - pass, out0, out1); - transpose_and_output8x8(&res08, &res09, &res10, &res11, - &res12, &res13, &res14, &res15, - pass, out0 + 8, out1 + 8); + transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05, + &res06, &res07, pass, out0, out1); + transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13, + &res14, &res15, pass, out0 + 8, out1 + 8); if (pass == 0) { - out0 += 8*16; + out0 += 8 * 16; } else { - out1 += 8*16; + out1 += 8 * 16; } } // Setup in/out for next pass. diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c index 3e4f49bd95262d4903e9e3a2c3d64dbca1a456fd..e14b99197f654ae84d312fd861b63e5de4a060ff 100644 --- a/vpx_dsp/x86/fwd_txfm_sse2.c +++ b/vpx_dsp/x86/fwd_txfm_sse2.c @@ -19,12 +19,12 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { __m128i in0, in1; __m128i tmp; const __m128i zero = _mm_setzero_si128(); - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) - (input + 2 * stride))); - in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) - (input + 3 * stride))); + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); tmp = _mm_add_epi16(in0, in1); in0 = _mm_unpacklo_epi16(zero, tmp); @@ -45,19 +45,19 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { } void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); __m128i u0, u1, sum; u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); sum = _mm_add_epi16(u0, u1); @@ -65,7 +65,7 @@ void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { in2 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, in0); - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); sum = _mm_add_epi16(sum, in2); in0 = _mm_unpacklo_epi16(u0, sum); @@ -92,50 +92,50 @@ void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int i; for (i = 0; i < 2; ++i) { - in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); - in1 = 
_mm_load_si128((const __m128i *)(input + 0 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8)); + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8)); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8)); + in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8)); sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8)); + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8)); sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8)); + in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8)); sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); sum = _mm_add_epi16(sum, u1); input += 8 * stride; } - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); in0 = _mm_unpacklo_epi16(u0, sum); in1 = _mm_unpackhi_epi16(u0, sum); in0 = _mm_srai_epi32(in0, 16); @@ -161,53 +161,53 @@ void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int i; for (i = 0; i < 8; ++i) { - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; u0 = _mm_add_epi16(in0, in1); u1 = 
_mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); sum = _mm_add_epi16(sum, u1); } - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); in0 = _mm_unpacklo_epi16(u0, sum); in1 = _mm_unpackhi_epi16(u0, sum); in0 = _mm_srai_epi32(in0, 16); @@ -230,43 +230,43 @@ void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, #define FDCT8x8_2D vpx_fdct8x8_sse2 #define FDCT16x16_2D vpx_fdct16x16_sse2 #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" -#undef FDCT4x4_2D -#undef FDCT8x8_2D -#undef FDCT16x16_2D +#undef FDCT4x4_2D +#undef FDCT8x8_2D +#undef FDCT16x16_2D #define FDCT32x32_2D vpx_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D vpx_fdct32x32_sse2 #define FDCT32x32_HIGH_PRECISION 1 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION -#undef DCT_HIGH_BIT_DEPTH +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION +#undef DCT_HIGH_BIT_DEPTH #if CONFIG_VP9_HIGHBITDEPTH #define DCT_HIGH_BIT_DEPTH 1 #define FDCT4x4_2D vpx_highbd_fdct4x4_sse2 #define FDCT8x8_2D vpx_highbd_fdct8x8_sse2 #define FDCT16x16_2D vpx_highbd_fdct16x16_sse2 -#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT -#undef FDCT4x4_2D -#undef FDCT8x8_2D -#undef FDCT16x16_2D +#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT +#undef FDCT4x4_2D +#undef FDCT8x8_2D +#undef FDCT16x16_2D #define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 -#include 
"vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D vpx_highbd_fdct32x32_sse2 #define FDCT32x32_HIGH_PRECISION 1 -#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION -#undef DCT_HIGH_BIT_DEPTH +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION +#undef DCT_HIGH_BIT_DEPTH #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/fwd_txfm_sse2.h b/vpx_dsp/x86/fwd_txfm_sse2.h index 94d5befbfea52b47ccd42fb0fd5a05f2dd9b1270..5201e764c83af753fe84dbe2fc5ce91856f4c4e0 100644 --- a/vpx_dsp/x86/fwd_txfm_sse2.h +++ b/vpx_dsp/x86/fwd_txfm_sse2.h @@ -63,99 +63,57 @@ static INLINE int check_epi16_overflow_x4(const __m128i *preg0, return _mm_movemask_epi8(cmp0); } -static INLINE int check_epi16_overflow_x8(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7) { +static INLINE int check_epi16_overflow_x8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); return res0 + res1; } -static INLINE int check_epi16_overflow_x12(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11) { +static INLINE int check_epi16_overflow_x12( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); - if (!res0) - res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); return res0 + res1; } -static INLINE int check_epi16_overflow_x16(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11, - const __m128i *preg12, - const __m128i *preg13, - const __m128i *preg14, - const __m128i *preg15) { +static INLINE int check_epi16_overflow_x16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); if (!res0) { 
res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); - if (!res1) - res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); } return res0 + res1; } -static INLINE int check_epi16_overflow_x32(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11, - const __m128i *preg12, - const __m128i *preg13, - const __m128i *preg14, - const __m128i *preg15, - const __m128i *preg16, - const __m128i *preg17, - const __m128i *preg18, - const __m128i *preg19, - const __m128i *preg20, - const __m128i *preg21, - const __m128i *preg22, - const __m128i *preg23, - const __m128i *preg24, - const __m128i *preg25, - const __m128i *preg26, - const __m128i *preg27, - const __m128i *preg28, - const __m128i *preg29, - const __m128i *preg30, - const __m128i *preg31) { +static INLINE int check_epi16_overflow_x32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); @@ -190,36 +148,31 @@ static INLINE int k_check_epi32_overflow_4(const __m128i *preg0, __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1); __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1); __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1); - __m128i reg0_top_dwords = _mm_shuffle_epi32( - reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg1_top_dwords = _mm_shuffle_epi32( - reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg2_top_dwords = _mm_shuffle_epi32( - reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg3_top_dwords = _mm_shuffle_epi32( - reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg0_top_dwords = + _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg1_top_dwords = + _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg2_top_dwords = + _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg3_top_dwords = + _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1)); __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords); __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords); __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero); __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero); __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one); __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one); - int overflow_01 = _mm_movemask_epi8( - _mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); - int overflow_23 = _mm_movemask_epi8( - _mm_cmpeq_epi32(valid_positve_23, 
valid_negative_23)); + int overflow_01 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); + int overflow_23 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23)); return (overflow_01 + overflow_23); } -static INLINE int k_check_epi32_overflow_8(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *zero) { +static INLINE int k_check_epi32_overflow_8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *zero) { int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); if (!overflow) { overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); @@ -227,91 +180,59 @@ static INLINE int k_check_epi32_overflow_8(const __m128i *preg0, return overflow; } -static INLINE int k_check_epi32_overflow_16(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11, - const __m128i *preg12, - const __m128i *preg13, - const __m128i *preg14, - const __m128i *preg15, - const __m128i *zero) { +static INLINE int k_check_epi32_overflow_16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *zero) { int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); if (!overflow) { overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, - zero); + overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, - zero); + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); } } } return overflow; } -static INLINE int k_check_epi32_overflow_32(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11, - const __m128i *preg12, - const __m128i *preg13, - const __m128i *preg14, - const __m128i *preg15, - const __m128i *preg16, - const __m128i *preg17, - const __m128i *preg18, - const __m128i *preg19, - const __m128i *preg20, - const __m128i *preg21, - const __m128i *preg22, - const __m128i *preg23, - const __m128i *preg24, - const __m128i *preg25, - const __m128i *preg26, - const __m128i *preg27, - const __m128i *preg28, - const __m128i *preg29, - const __m128i *preg30, - const __m128i *preg31, - const __m128i *zero) { +static INLINE int k_check_epi32_overflow_32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const 
__m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31, const __m128i *zero) { int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); if (!overflow) { overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); if (!overflow) { overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, - zero); + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, - zero); + overflow = + k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg20, preg21, - preg22, preg23, zero); + overflow = + k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg24, preg25, - preg26, preg27, zero); + overflow = k_check_epi32_overflow_4(preg24, preg25, preg26, + preg27, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg28, preg29, - preg30, preg31, zero); + overflow = k_check_epi32_overflow_4(preg28, preg29, preg30, + preg31, zero); } } } @@ -322,7 +243,7 @@ static INLINE int k_check_epi32_overflow_32(const __m128i *preg0, return overflow; } -static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) { +static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { #if CONFIG_VP9_HIGHBITDEPTH const __m128i zero = _mm_setzero_si128(); const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); @@ -335,7 +256,7 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) { #endif // CONFIG_VP9_HIGHBITDEPTH } -static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) { +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { #if CONFIG_VP9_HIGHBITDEPTH const __m128i zero = _mm_setzero_si128(); const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); @@ -348,9 +269,7 @@ static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) { #endif // CONFIG_VP9_HIGHBITDEPTH } - -static INLINE __m128i mult_round_shift(const __m128i *pin0, - const __m128i *pin1, +static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, const __m128i *pmultiplier, const __m128i *prounding, const int shift) { @@ -364,12 +283,10 @@ static INLINE __m128i mult_round_shift(const __m128i *pin0, } static INLINE void transpose_and_output8x8( - const __m128i *pin00, const __m128i *pin01, - const __m128i *pin02, const __m128i *pin03, - const __m128i *pin04, const __m128i *pin05, - const __m128i *pin06, const __m128i *pin07, - const int pass, int16_t* out0_ptr, - tran_low_t* out1_ptr) { + const __m128i *pin00, const __m128i *pin01, const __m128i *pin02, + const __m128i *pin03, const __m128i *pin04, const __m128i *pin05, + const __m128i *pin06, const __m128i *pin07, const int pass, + int16_t *out0_ptr, tran_low_t *out1_ptr) { // 00 01 02 03 04 05 
06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 @@ -427,14 +344,14 @@ static INLINE void transpose_and_output8x8( // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 if (pass == 0) { - _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0); - _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1); - _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2); - _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3); - _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4); - _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5); - _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6); - _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7); + _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); + _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); + _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); + _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); + _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); + _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); + _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); + _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); } else { storeu_output(&tr2_0, (out1_ptr + 0 * 16)); storeu_output(&tr2_1, (out1_ptr + 1 * 16)); diff --git a/vpx_dsp/x86/halfpix_variance_sse2.c b/vpx_dsp/x86/halfpix_variance_sse2.c index 4a8fb6df7a3fd263c12b50ecf32ebf27bd4dfaca..b5c3f5fa2b1ec9d6b077400136f6ef9686b0f1ed 100644 --- a/vpx_dsp/x86/halfpix_variance_sse2.c +++ b/vpx_dsp/x86/halfpix_variance_sse2.c @@ -17,10 +17,8 @@ void vpx_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride, const unsigned char *src, - int src_stride, - unsigned int height, - int *sum, - unsigned int *sumsquared); + int src_stride, unsigned int height, + int *sum, unsigned int *sumsquared); void vpx_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride, const unsigned char *src, int src_stride, unsigned int height, int *sum, @@ -33,8 +31,7 @@ void vpx_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride, uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src, int src_stride, const unsigned char *dst, - int dst_stride, - uint32_t *sse) { + int dst_stride, uint32_t *sse) { int xsum0; unsigned int xxsum0; @@ -50,12 +47,11 @@ uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src, uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src, int src_stride, const unsigned char *dst, - int dst_stride, - uint32_t *sse) { + int dst_stride, uint32_t *sse) { int xsum0; unsigned int xxsum0; - vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, - &xsum0, &xxsum0); + vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0, + &xxsum0); *sse = xxsum0; assert(xsum0 <= 255 * 16 * 16); @@ -63,12 +59,10 @@ uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src, return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); } - uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src, int src_stride, const unsigned char *dst, - int dst_stride, - uint32_t *sse) { + int dst_stride, uint32_t *sse) { int xsum0; unsigned int xxsum0; diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c index 72e42adc91d7861f73a749f4efd3ee66502243f8..7d664110801bd403ebff57fb1de0e003b1f2250f 100644 --- a/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -25,16 +25,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { if (bd == 8) { t80 
= _mm_set1_epi16(0x80); - max = _mm_subs_epi16( - _mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); } else if (bd == 10) { t80 = _mm_set1_epi16(0x200); - max = _mm_subs_epi16( - _mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); } else { // bd == 12 t80 = _mm_set1_epi16(0x800); - max = _mm_subs_epi16( - _mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); } min = _mm_subs_epi16(zero, t80); @@ -81,16 +78,16 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, blimit = _mm_slli_epi16( _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); } else { // bd == 12 blimit = _mm_slli_epi16( _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); } q4 = _mm_load_si128((__m128i *)(s + 4 * p)); @@ -118,25 +115,22 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, hev = _mm_subs_epu16(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); - abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0), - _mm_subs_epu16(p0, p1)), - _mm_or_si128(_mm_subs_epu16(q1, q0), - _mm_subs_epu16(q0, q1))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1), - _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q1), - _mm_subs_epu16(q1, q2))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2), - _mm_subs_epu16(p2, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q2), - _mm_subs_epu16(q2, q3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); mask = _mm_subs_epu16(mask, limit); @@ -160,8 +154,8 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, ps0 = _mm_subs_epi16(p0, t80); qs0 = _mm_subs_epi16(q0, t80); - filt = _mm_and_si128( - signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev); + 
filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), + hev); work_a = _mm_subs_epi16(qs0, ps0); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); @@ -175,33 +169,27 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, filter2 = _mm_srai_epi16(filter2, 0x3); qs0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), - t80); + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); ps0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), - t80); + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(hev, filt); - qs1 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), - t80); - ps1 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), - t80); + qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); + ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); // end highbd_filter4 // loopfilter done // highbd_flat_mask4 - flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0), - _mm_subs_epu16(p0, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p0), - _mm_subs_epu16(p0, p3))); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0), - _mm_subs_epu16(q0, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q0), - _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); flat = _mm_max_epi16(work, flat); work = _mm_max_epi16(abs_p1p0, abs_q1q0); flat = _mm_max_epi16(work, flat); @@ -229,27 +217,23 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 // but referred to as p0-p4 & q0-q4 in fn) - flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0), - _mm_subs_epu16(p0, p4)), - _mm_or_si128(_mm_subs_epu16(q4, q0), - _mm_subs_epu16(q0, q4))); - - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0), - _mm_subs_epu16(p0, p5)), - _mm_or_si128(_mm_subs_epu16(q5, q0), - _mm_subs_epu16(q0, q5))); + flat2 = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)), + _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4))); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)), + _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5))); flat2 = _mm_max_epi16(work, flat2); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0), - _mm_subs_epu16(p0, p6)), - _mm_or_si128(_mm_subs_epu16(q6, q0), - _mm_subs_epu16(q0, q6))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)), + _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6))); flat2 = _mm_max_epi16(work, flat2); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0), - _mm_subs_epu16(p0, p7)), - _mm_or_si128(_mm_subs_epu16(q7, q0), - _mm_subs_epu16(q0, q7))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)), + _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7))); flat2 = _mm_max_epi16(work, flat2); if (bd == 8) @@ -268,29 +252,26 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, eight = 
_mm_set1_epi16(8); four = _mm_set1_epi16(4); - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), - _mm_add_epi16(p4, p3)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), - _mm_add_epi16(q4, q3)); + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3)); pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, - pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, - pixetFilter_q2q1q0)); - flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(p7, p0)), 4); - flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(q7, q0)), 4); - flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3, p0)), 3); - flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3, q0)), 3); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + flat2_p0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4); + flat2_q0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4); + flat_p0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3); + flat_q0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3); sum_p7 = _mm_add_epi16(p7, p7); sum_q7 = _mm_add_epi16(q7, q7); @@ -306,10 +287,10 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); - flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1)), 3); - flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1)), 3); + flat_p1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3); + flat_q1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = _mm_add_epi16(sum_q7, q7); @@ -318,53 +299,53 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5); - flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p2)), 4); - flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q2)), 4); + flat2_p2 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4); + flat2_q2 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); - flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2)), 3); - flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2)), 3); + flat_p2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3); + flat_q2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = 
_mm_add_epi16(sum_q7, q7); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); - flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p3)), 4); - flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q3)), 4); + flat2_p3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4); + flat2_q3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = _mm_add_epi16(sum_q7, q7); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); - flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p4)), 4); - flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q4)), 4); + flat2_p4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4); + flat2_q4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = _mm_add_epi16(sum_q7, q7); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); - flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p5)), 4); - flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q5)), 4); + flat2_p5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4); + flat2_q5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = _mm_add_epi16(sum_q7, q7); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); - flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p6)), 4); - flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q6)), 4); + flat2_p6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4); + flat2_q6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4); // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -505,8 +486,7 @@ void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh, - int bd) { + const uint8_t *_thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); @@ -546,19 +526,19 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, t80 = _mm_set1_epi16(0x80); } else if (bd == 10) { blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); t80 = _mm_set1_epi16(0x200); } else { // bd == 12 blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + 
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); t80 = _mm_set1_epi16(0x800); } @@ -568,20 +548,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, qs1 = _mm_subs_epi16(q1, t80); // filter_mask and hev_mask - abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), - _mm_subs_epu16(p0, p1)); - abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), - _mm_subs_epu16(q0, q1)); - - abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), - _mm_subs_epu16(q0, p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), - _mm_subs_epu16(q1, p1)); + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); flat = _mm_max_epi16(abs_p1p0, abs_q1q0); hev = _mm_subs_epu16(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); - abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); @@ -593,28 +569,24 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, mask = _mm_max_epi16(abs_q1q0, mask); // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1), - _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q1), - _mm_subs_epu16(q1, q2))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2), - _mm_subs_epu16(p2, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q2), - _mm_subs_epu16(q2, q3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); mask = _mm_subs_epu16(mask, limit); mask = _mm_cmpeq_epi16(mask, zero); // flat_mask4 - flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0), - _mm_subs_epu16(p0, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q0), - _mm_subs_epu16(q0, q2))); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0), - _mm_subs_epu16(p0, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q0), - _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); flat = _mm_max_epi16(work, flat); flat = _mm_max_epi16(abs_p1p0, flat); flat = _mm_max_epi16(abs_q1q0, flat); @@ -737,14 +709,10 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, _mm_store_si128((__m128i *)(s + 2 * p), q2); } -void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p, - const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1, - int bd) { +void vpx_highbd_lpf_horizontal_8_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const 
uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } @@ -752,8 +720,7 @@ void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p, void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh, - int bd) { + const uint8_t *_thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; @@ -765,16 +732,16 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), - _mm_subs_epu16(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), - _mm_subs_epu16(q0, q1)); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); const __m128i one = _mm_set1_epi16(1); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), - _mm_subs_epu16(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), - _mm_subs_epu16(q1, p1)); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); __m128i work; const __m128i t4 = _mm_set1_epi16(4); const __m128i t3 = _mm_set1_epi16(3); @@ -838,7 +805,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, hev = _mm_subs_epu16(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); - abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); @@ -848,15 +815,13 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, mask = _mm_max_epi16(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1), - _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p2), - _mm_subs_epu16(p2, p3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1), - _mm_subs_epu16(q1, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q2), - _mm_subs_epu16(q2, q3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); mask = _mm_subs_epu16(mask, limit); mask = _mm_cmpeq_epi16(mask, zero); @@ -878,8 +843,8 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, // Filter1 >> 3 work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 - filter1 = _mm_and_si128(filter1, t1f); // clamp the range + work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 + filter1 = 
_mm_and_si128(filter1, t1f); // clamp the range filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits // Filter2 >> 3 @@ -901,12 +866,12 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, q0 = _mm_adds_epi16( signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); - q1 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80); + q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); p0 = _mm_adds_epi16( signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); - p1 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80); + p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); @@ -914,35 +879,38 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_storeu_si128((__m128i *)(s + 1 * p), q1); } -void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p, - const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1, - int bd) { +void vpx_highbd_lpf_horizontal_4_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } -static INLINE void highbd_transpose(uint16_t *src[], int in_p, - uint16_t *dst[], int out_p, - int num_8x8_to_transpose) { +static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], + int out_p, int num_8x8_to_transpose) { int idx8x8 = 0; __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; do { uint16_t *in = src[idx8x8]; uint16_t *out = dst[idx8x8]; - p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 - p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 - p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 - p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 - p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 - p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 - p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 - p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 + p0 = + _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + p1 = + _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + p2 = + _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + p3 = + _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + p4 = + _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + p5 = + _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + p6 = + _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + p7 = + _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 // 00 10 01 11 02 12 03 13 x0 = _mm_unpacklo_epi16(p0, p1); // 20 30 21 31 22 32 23 33 @@ -960,9 +928,9 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p, // 01 11 21 31 41 51 61 71 x7 = _mm_unpackhi_epi64(x4, x5); - _mm_storeu_si128((__m128i *)(out + 0*out_p), x6); + _mm_storeu_si128((__m128i *)(out 
+ 0 * out_p), x6); // 00 10 20 30 40 50 60 70 - _mm_storeu_si128((__m128i *)(out + 1*out_p), x7); + _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); // 01 11 21 31 41 51 61 71 // 02 12 22 32 03 13 23 33 @@ -974,9 +942,9 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p, // 03 13 23 33 43 53 63 73 x7 = _mm_unpackhi_epi64(x4, x5); - _mm_storeu_si128((__m128i *)(out + 2*out_p), x6); + _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); // 02 12 22 32 42 52 62 72 - _mm_storeu_si128((__m128i *)(out + 3*out_p), x7); + _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); // 03 13 23 33 43 53 63 73 // 04 14 05 15 06 16 07 17 @@ -996,9 +964,9 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p, // 05 15 25 35 45 55 65 75 x7 = _mm_unpackhi_epi64(x4, x5); - _mm_storeu_si128((__m128i *)(out + 4*out_p), x6); + _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); // 04 14 24 34 44 54 64 74 - _mm_storeu_si128((__m128i *)(out + 5*out_p), x7); + _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); // 05 15 25 35 45 55 65 75 // 06 16 26 36 07 17 27 37 @@ -1010,15 +978,15 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p, // 07 17 27 37 47 57 67 77 x7 = _mm_unpackhi_epi64(x4, x5); - _mm_storeu_si128((__m128i *)(out + 6*out_p), x6); + _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); // 06 16 26 36 46 56 66 76 - _mm_storeu_si128((__m128i *)(out + 7*out_p), x7); + _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); // 07 17 27 37 47 57 67 77 } while (++idx8x8 < num_8x8_to_transpose); } -static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, - int in_p, uint16_t *out, int out_p) { +static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, + uint16_t *out, int out_p) { uint16_t *src0[1]; uint16_t *src1[1]; uint16_t *dest0[1]; @@ -1031,10 +999,8 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, highbd_transpose(src1, in_p, dest1, out_p, 1); } -void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; @@ -1056,14 +1022,10 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, highbd_transpose(src, 8, dst, p, 1); } -void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_vertical_4_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); uint16_t *src[2]; uint16_t *dst[2]; @@ -1083,10 +1045,8 @@ void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p, highbd_transpose(src, 16, dst, p, 2); } -void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; @@ -1108,14 +1068,10 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, highbd_transpose(src, 8, dst, p, 1); } -void 
vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_vertical_8_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); uint16_t *src[2]; uint16_t *dst[2]; @@ -1136,11 +1092,9 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p, highbd_transpose(src, 16, dst, p, 2); } -void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, - const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int bd) { + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); uint16_t *src[2]; uint16_t *dst[2]; @@ -1154,8 +1108,8 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, highbd_transpose(src, p, dst, 8, 2); // Loop filtering - vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, - thresh, bd); + vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, + bd); src[0] = t_dst; src[1] = t_dst + 8 * 8; dst[0] = s - 8; @@ -1165,12 +1119,10 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, highbd_transpose(src, 8, dst, p, 2); } -void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, - int p, +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int bd) { + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[256]); // Transpose 16x16 diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 164ffcff2bd7fdac2ba80305d59c2449b8b09cc2..dad00dfe97162abc877ff149f2ab176cd40a69f4 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -16,26 +16,19 @@ #include "vpx_ports/mem.h" #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, - intptr_t count, - int skip_block, - const int16_t *zbin_ptr, +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, - const int16_t *iscan) { + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, j, non_zero_regs = (int)count / 4, eob_i = -1; __m128i zbins[2]; __m128i nzbins[2]; - zbins[0] = _mm_set_epi32((int)zbin_ptr[1], - (int)zbin_ptr[1], - (int)zbin_ptr[1], + zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[0]); zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); @@ -74,14 +67,13 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32( - _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); tmp1 = 
_mm_or_si128(tmp1, tmp2); test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i*)abs_coeff, coeffs); - _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); for (j = 0; j < 4; j++) { if (test & (1 << (4 * j))) { @@ -92,8 +84,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16); qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) - eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } } } @@ -101,20 +92,12 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, *eob_ptr = eob_i + 1; } - -void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, - int skip_block, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, - const int16_t *iscan) { +void vpx_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; @@ -123,10 +106,7 @@ void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; - zbins[0] = _mm_set_epi32(zbin1_tmp, - zbin1_tmp, - zbin1_tmp, - zbin0_tmp); + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); nzbins[0] = _mm_setzero_si128(); @@ -147,14 +127,10 @@ void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); cmp1 = _mm_and_si128(cmp1, cmp2); test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) - idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) - idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) - idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) - idx_arr[idx++] = i * 4 + 3; + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; } // Quantization pass: only process the coefficients selected in @@ -164,15 +140,14 @@ void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff - + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) - eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } } *eob_ptr = eob + 1; diff --git a/vpx_dsp/x86/highbd_subtract_sse2.c b/vpx_dsp/x86/highbd_subtract_sse2.c index 33e464b7842d16b331bf977b66f52e8fb5b6a4fd..e7d5ac2982f23bd7f039b66bab74b76c82e2489f 100644 --- a/vpx_dsp/x86/highbd_subtract_sse2.c +++ b/vpx_dsp/x86/highbd_subtract_sse2.c @@ -15,10 +15,10 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" -typedef void (*SubtractWxHFuncType)( - int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride); +typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, + ptrdiff_t pred_stride); static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, const uint16_t *src, ptrdiff_t src_stride, @@ -26,17 +26,17 @@ static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; __m128i x0, x1, x2, x3; - int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride); + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); - u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride)); + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride)); + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); @@ -44,11 +44,11 @@ static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, x3 = _mm_sub_epi16(u3, v3); _mm_storel_epi64((__m128i *)store_diff, x0); - store_diff = (int64_t *) (diff + 1 * diff_stride); + store_diff = (int64_t *)(diff + 1 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x1); - store_diff = (int64_t *) (diff + 2 * diff_stride); + store_diff = (int64_t *)(diff + 2 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x2); - store_diff = (int64_t *) (diff + 3 * diff_stride); + store_diff = (int64_t *)(diff + 3 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x3); } @@ -58,25 +58,25 @@ static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, __m128i u0, u1, u2, u3, u4, u5, u6, u7; __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x0, x1, x2, x3, x4, x5, x6, x7; - int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride); - - u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride)); - u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride)); - u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride)); - u6 = _mm_loadu_si128((__m128i const *) (src + 6 * 
src_stride)); - u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride)); - - v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride)); - v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride)); - v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride)); - v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride)); - v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride)); + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); @@ -88,19 +88,19 @@ static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, x7 = _mm_sub_epi16(u7, v7); _mm_storel_epi64((__m128i *)store_diff, x0); - store_diff = (int64_t *) (diff + 1 * diff_stride); + store_diff = (int64_t *)(diff + 1 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x1); - store_diff = (int64_t *) (diff + 2 * diff_stride); + store_diff = (int64_t *)(diff + 2 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x2); - store_diff = (int64_t *) (diff + 3 * diff_stride); + store_diff = (int64_t *)(diff + 3 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x3); - store_diff = (int64_t *) (diff + 4 * diff_stride); + store_diff = (int64_t *)(diff + 4 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x4); - store_diff = (int64_t *) (diff + 5 * diff_stride); + store_diff = (int64_t *)(diff + 5 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x5); - store_diff = (int64_t *) (diff + 6 * diff_stride); + store_diff = (int64_t *)(diff + 6 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x6); - store_diff = (int64_t *) (diff + 7 * diff_stride); + store_diff = (int64_t *)(diff + 7 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x7); } @@ -111,25 +111,25 @@ static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, __m128i v0, v1, v2, v3; __m128i x0, x1, x2, x3; - u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride)); + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = 
_mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride)); + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); x2 = _mm_sub_epi16(u2, v2); x3 = _mm_sub_epi16(u3, v3); - _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0); - _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1); - _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2); - _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); } static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, @@ -139,23 +139,23 @@ static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x0, x1, x2, x3, x4, x5, x6, x7; - u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride)); - u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride)); - u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride)); - u6 = _mm_loadu_si128((__m128i const *) (src + 6 * src_stride)); - u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride)); - - v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride)); - v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride)); - v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride)); - v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride)); - v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride)); + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * 
pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); @@ -166,14 +166,14 @@ static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, x6 = _mm_sub_epi16(u6, v6); x7 = _mm_sub_epi16(u7, v7); - _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0); - _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1); - _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2); - _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3); - _mm_storeu_si128((__m128i *) (diff + 4 * diff_stride), x4); - _mm_storeu_si128((__m128i *) (diff + 5 * diff_stride), x5); - _mm_storeu_si128((__m128i *) (diff + 6 * diff_stride), x6); - _mm_storeu_si128((__m128i *) (diff + 7 * diff_stride), x7); + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); + _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); + _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6); + _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); } static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride, @@ -349,17 +349,14 @@ static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { return ret_func_ptr; } -void vpx_highbd_subtract_block_sse2( - int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, - ptrdiff_t pred_stride, - int bd) { +void vpx_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride, int bd) { uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); SubtractWxHFuncType func; - (void) bd; + (void)bd; func = getSubtractFunc(rows, cols); func(diff, diff_stride, src, src_stride, pred, pred_stride); diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index 364391578592582b06efd67e4183dfe29a7c1766..76e8816db9686c21aa4ac9883a2461e0b4acf160 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -15,9 +15,9 @@ #include "vpx_ports/mem.h" -typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); +typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, @@ -28,8 +28,8 @@ uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, uint32_t *sse, int *sum); static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; @@ -40,8 +40,8 @@ static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); 
*sse += sse0; *sum += sum0; } @@ -49,8 +49,8 @@ static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, } static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; uint64_t sse_long = 0; @@ -60,8 +60,8 @@ static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); sse_long += sse0; sum_long += sum0; } @@ -71,8 +71,8 @@ static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, } static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; uint64_t sse_long = 0; @@ -82,8 +82,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); sse_long += sse0; sum_long += sum0; } @@ -92,84 +92,83 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); } - -#define HIGH_GET_VAR(S) \ -void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ -} \ -\ -void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 2); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ -} \ -\ -void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 4); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ -} +#define HIGH_GET_VAR(S) \ + void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = 
CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + } HIGH_GET_VAR(16); HIGH_GET_VAR(8); #undef HIGH_GET_VAR -#define VAR_FN(w, h, block_size, shift) \ -uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vpx_highbd_calc##block_size##x##block_size##var_sse2, \ - block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} \ -\ -uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? (uint32_t)var : 0; \ -} \ -\ -uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_12_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? (uint32_t)var : 0; \ -} +#define VAR_FN(w, h, block_size, shift) \ + uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ + } \ + \ + uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } VAR_FN(64, 64, 16, 12); VAR_FN(64, 32, 16, 11); @@ -185,13 +184,13 @@ VAR_FN(8, 8, 8, 6); #undef VAR_FN unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_highbd_calc16x16var_sse2, 16); return *sse; } @@ -201,8 +200,8 @@ unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_highbd_calc16x16var_sse2, 16); return *sse; } @@ -212,19 +211,19 @@ unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_highbd_calc16x16var_sse2, 16); return *sse; } unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + vpx_highbd_calc8x8var_sse2, 8); return *sse; } @@ -234,8 +233,8 @@ unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + vpx_highbd_calc8x8var_sse2, 8); return *sse; } @@ -245,25 +244,21 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + vpx_highbd_calc8x8var_sse2, 8); return *sse; } // The 2 unused parameters are place holders for PIC enabled build. 
// These definitions are for functions defined in // highbd_subpel_variance_impl_sse2.asm -#define DECL(w, opt) \ - int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint16_t *dst, \ - ptrdiff_t dst_stride, \ - int height, \ - unsigned int *sse, \ - void *unused0, void *unused); +#define DECL(w, opt) \ + int vpx_highbd_sub_pixel_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ - DECL(8, opt); \ + DECL(8, opt); \ DECL(16, opt) DECLS(sse2); @@ -271,152 +266,134 @@ DECLS(sse2); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ -uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \ - int src_stride, \ - int x_offset, \ - int y_offset, \ - const uint8_t *dst8, \ - int dst_stride, \ - uint32_t *sse_ptr) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, h, \ - &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ - src_stride, \ - x_offset, y_offset, \ - dst + 16, \ - dst_stride, \ - h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ - src_stride, \ - x_offset, y_offset, \ - dst + 16, \ - dst_stride, \ - h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - int start_row; \ - uint32_t sse; \ - int se = 0; \ - uint64_t long_sse = 0; \ - uint16_t *src = 
CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - for (start_row = 0; start_row < h; start_row +=16) { \ - uint32_t sse2; \ - int height = h - start_row < 16 ? h - start_row : 16; \ - int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + (start_row * dst_stride), \ - dst_stride, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + 16 + (start_row * dst_stride), \ - dst_stride, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + 32 + (start_row * dst_stride), \ - dst_stride, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + 48 + (start_row * dst_stride), \ - dst_stride, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - }\ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 4); \ - sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} - -#define FNS(opt) \ -FN(64, 64, 16, 6, 6, opt, (int64_t)); \ -FN(64, 32, 16, 6, 5, opt, (int64_t)); \ -FN(32, 64, 16, 5, 6, opt, (int64_t)); \ -FN(32, 32, 16, 5, 5, opt, (int64_t)); \ -FN(32, 16, 16, 5, 4, opt, (int64_t)); \ -FN(16, 32, 16, 4, 5, opt, (int64_t)); \ -FN(16, 16, 16, 4, 4, opt, (int64_t)); \ -FN(16, 8, 16, 4, 3, opt, (int64_t)); \ -FN(8, 16, 8, 3, 4, opt, (int64_t)); \ -FN(8, 8, 8, 3, 3, opt, (int64_t)); \ -FN(8, 4, 8, 3, 2, opt, (int64_t)); +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + if (w > 
wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ + NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ + &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); FNS(sse2); @@ -424,183 +401,162 @@ FNS(sse2); #undef FN // The 2 unused parameters are place holders for PIC enabled build. 
-#define DECL(w, opt) \ -int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint16_t *dst, \ - ptrdiff_t dst_stride, \ - const uint16_t *sec, \ - ptrdiff_t sec_stride, \ - int height, \ - unsigned int *sse, \ - void *unused0, void *unused); +#define DECL(w, opt) \ + int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused); #define DECLS(opt1) \ -DECL(16, opt1) \ -DECL(8, opt1) + DECL(16, opt1) \ + DECL(8, opt1) DECLS(sse2); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ -uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, \ - y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, \ - dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, \ - dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - sec, w, h, &sse, NULL, NULL); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, \ - x_offset, y_offset, \ - dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t 
vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - int start_row; \ - uint32_t sse; \ - int se = 0; \ - uint64_t long_sse = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - for (start_row = 0; start_row < h; start_row +=16) { \ - uint32_t sse2; \ - int height = h - start_row < 16 ? h - start_row : 16; \ - int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + (start_row * dst_stride), dst_stride, \ - sec + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, \ - dst + 16 + (start_row * dst_stride), dst_stride, \ - sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, \ - dst + 32 + (start_row * dst_stride), dst_stride, \ - sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, \ - dst + 48 + (start_row * dst_stride), dst_stride, \ - sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - } \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 4); \ - sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} - +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = 
CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ + sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } -#define FNS(opt1) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ -FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ -FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ -FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ -FN(8, 4, 8, 3, 2, opt1, (int64_t)); +#define FNS(opt1) \ + FN(64, 64, 16, 6, 
6, opt1, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ + FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt1, (int64_t)); FNS(sse2); #undef FNS #undef FN -void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, - int width, int height, - const uint8_t *ref8, - int ref_stride) { +void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height, + const uint8_t *ref8, int ref_stride) { int i, j; int stride = ref_stride << 3; uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); @@ -608,7 +564,7 @@ void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, if (width >= 8) { // read 8 points at one time for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 8) { + for (j = 0; j < width; j += 8) { __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); @@ -629,14 +585,14 @@ void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, _mm_storeu_si128((__m128i *)(comp_pred), t0); comp_pred += 8; - ref += 64; // 8 * 8; + ref += 64; // 8 * 8; } ref += stride - (width << 3); } } else { // read 4 points at one time for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 4) { + for (j = 0; j < width; j += 4) { __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); @@ -657,9 +613,8 @@ void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, } void vpx_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, - const uint8_t *pred8, - int width, int height, - const uint8_t *ref8, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, int ref_stride) { const __m128i one = _mm_set1_epi16(1); int i, j; @@ -670,7 +625,7 @@ void vpx_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, if (width >= 8) { // read 8 points at one time for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 8) { + for (j = 0; j < width; j += 8) { __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); @@ -704,7 +659,7 @@ void vpx_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, } else { // read 4 points at one time for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 4) { + for (j = 0; j < width; j += 4) { __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); diff --git a/vpx_dsp/x86/highbd_variance_sse4.c b/vpx_dsp/x86/highbd_variance_sse4.c index 4d0b75deab782f90b3efc9d27b7e0e88568a6aa1..d1d2146b473ba2f4e765d293525e95fe1dd5450a 100644 --- a/vpx_dsp/x86/highbd_variance_sse4.c +++ b/vpx_dsp/x86/highbd_variance_sse4.c @@ -65,10 +65,8 @@ static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, *sum = (int64_t)_mm_extract_epi32(y0, 0); } -uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a, - int a_stride, - const uint8_t *b, - int b_stride, +uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t 
*sse) { int64_t sum; uint64_t local_sse; @@ -79,10 +77,8 @@ uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a, return *sse - (uint32_t)((sum * sum) >> 4); } -uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a, - int a_stride, - const uint8_t *b, - int b_stride, +uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse) { int64_t sum; uint64_t local_sse; @@ -94,10 +90,8 @@ uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a, return *sse - (uint32_t)((sum * sum) >> 4); } -uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a, - int a_stride, - const uint8_t *b, - int b_stride, +uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse) { int64_t sum; uint64_t local_sse; @@ -111,136 +105,108 @@ uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a, // Sub-pixel uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse) { - + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), - 4, dst, dst_stride, sse); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, + sse); } uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), - 4, dst, dst_stride, sse); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); } uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), - 4, dst, dst_stride, sse); + src, fdata3, src_stride, 1, 4 + 1, 4, 
bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); } // Sub-pixel average uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse, + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, const uint8_t *second_pred) { - uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); - vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, - CONVERT_TO_BYTEPTR(temp2), 4); + vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); - return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), - 4, dst, dst_stride, sse); + return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, + sse); } uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse, + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, const uint8_t *second_pred) { - uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); - vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, - CONVERT_TO_BYTEPTR(temp2), 4); + vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); - return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), - 4, dst, dst_stride, sse); + return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); } uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse, + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, const uint8_t *second_pred) { - uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); - vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, - CONVERT_TO_BYTEPTR(temp2), 4); + 
vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); - return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), - 4, dst, dst_stride, sse); + return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); } diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index df5068c624b483f8d148b356e0b25f6aabdd17b2..a6fc1161f120b99db58b6331ff2660bd9d5d02cb 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -12,14 +12,14 @@ #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -#define RECON_AND_STORE4X4(dest, in_x) \ -{ \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ -} +#define RECON_AND_STORE4X4(dest, in_x) \ + { \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)(dest) = _mm_cvtsi128_si32(d0); \ + } void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { @@ -263,192 +263,189 @@ void iadst4_sse2(__m128i *in) { in[1] = _mm_packs_epi32(u[2], u[3]); } -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i 
tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ } -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ - out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ } #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ } // Define Macro for multiplying elements by constants and adding them together. 
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ - cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ +#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ + res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ } #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ 
+ tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ } -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_4, \ - stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_0, \ - stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ + out4, out5, out6, out7) \ + { \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ + stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = 
_mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ + stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ } void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, @@ -484,12 +481,12 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, // 2-D for (i = 0; i < 2; i++) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, + in6, in7); } // Final rounding and shift @@ -560,12 +557,12 @@ void idct8_sse2(__m128i *in) { __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, + in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], + in[4], in[5], in[6], in[7]); } void iadst8_sse2(__m128i *in) { @@ -906,8 +903,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, - in0, in1, in2, in3, in4, in5, in6, in7); + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, + in5, in6, in7); // Final rounding and shift in0 = _mm_adds_epi16(in0, final_rounding); in1 = _mm_adds_epi16(in1, final_rounding); @@ -937,242 +934,234 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t 
*input, uint8_t *dest, RECON_AND_STORE(dest + 7 * stride, in7); } -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ - stg2_0, stg2_1, stg2_2, stg2_3, \ - stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ - stg2_4, stg2_5, stg2_6, stg2_7, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ - stg3_0, stg3_1, stg3_2, stg3_3, \ - stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ - stg4_0, stg4_1, stg4_2, stg4_3, \ - stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = 
_mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ +#define IDCT16 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ + stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ + stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ + stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, 
stg4_1, \ + stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ } -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ - stg2_0, stg2_1, stg2_6, stg2_7, \ - stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ - stg3_0, stg3_1, \ - stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); 
\ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ - stg4_0, stg4_1, \ - stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ + stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ + stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = 
_mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + } void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { @@ -1207,10 +1196,10 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, __m128i in[16], l[16], r[16], *curr1; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; @@ -1312,8 +1301,8 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, dc_value = _mm_set1_epi16(a); for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); + RECON_AND_STORE(dest + 0, dc_value); + RECON_AND_STORE(dest + 8, dc_value); dest += 
stride; } } @@ -1891,9 +1880,9 @@ static void idct16_8col(__m128i *in) { u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - s[8] = _mm_packs_epi32(u[0], u[1]); + s[8] = _mm_packs_epi32(u[0], u[1]); s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); + s[9] = _mm_packs_epi32(u[4], u[5]); s[14] = _mm_packs_epi32(u[6], u[7]); s[10] = _mm_packs_epi32(u[8], u[9]); s[13] = _mm_packs_epi32(u[10], u[11]); @@ -2021,7 +2010,7 @@ static void idct16_8col(__m128i *in) { s[7] = _mm_add_epi16(t[6], t[7]); s[8] = t[8]; s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); + s[9] = _mm_packs_epi32(u[8], u[9]); s[14] = _mm_packs_epi32(u[10], u[11]); s[10] = _mm_packs_epi32(u[12], u[13]); s[13] = _mm_packs_epi32(u[14], u[15]); @@ -2167,11 +2156,11 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, + stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, + stp1_12_0; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; // First 1-D inverse DCT @@ -2203,7 +2192,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_8 = _mm_packs_epi32(tmp0, tmp2); stp2_11 = _mm_packs_epi32(tmp5, tmp7); } @@ -2267,9 +2256,9 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, tmp2 = _mm_add_epi16(stp2_9, stp2_10); tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); stp1_11 = _mm_unpacklo_epi64(tmp1, zero); stp1_13 = _mm_unpackhi_epi64(tmp3, zero); @@ -2381,650 +2370,647 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, } } -#define LOAD_DQCOEFF(reg, input) \ - { \ +#define LOAD_DQCOEFF(reg, input) \ + { \ reg = load_input_data(input); \ - input += 8; \ - } \ - -#define IDCT32_34 \ -/* Stage1 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ - stg1_1, stp1_16, stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ - stg1_7, stp1_19, stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ - stg1_9, stp1_20, stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ - stg1_15, stp1_23, stp1_24); \ -} \ -\ -/* Stage2 */ \ -{ 
\ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ - stg2_1, stp2_8, stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ - stg2_7, stp2_11, stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ - stg3_1, stp1_4, stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ - stg4_1, stp2_0, stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = 
_mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = 
_mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} + input += 8; \ + } +#define IDCT32_34 \ + /* Stage1 */ \ + { \ + const __m128i zero = _mm_setzero_si128(); \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ + \ + const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ + \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ + stp1_31); \ + MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ + stp1_28); \ + MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ + stp1_27); \ + MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ + stp1_24); \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i zero = _mm_setzero_si128(); \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + \ + 
const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ + stp2_15); \ + MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ + stp2_12); \ + \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ + \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ + \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ + \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i zero = _mm_setzero_si128(); \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ + \ + MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ + stp1_7); \ + \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ + stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ + stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i zero = _mm_setzero_si128(); \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ + stp2_1); \ + \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ + } \ + \ 
+ /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ + stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + 
stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ + } \ + \ + /* Stage7 */ \ + { \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ + stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } -#define IDCT32 \ -/* Stage1 */ \ -{ \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ - stp1_17, stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ - stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ - stp1_19, 
stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ - stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = 
stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ - stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ - stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - 
\ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = 
_mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} +#define IDCT32 \ + /* Stage1 */ \ + { \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ + const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ + stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ + stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ + \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, 
stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ + stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ + stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ + stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = 
_mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ + stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + 
stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ + } \ + \ + /* Stage7 */ \ + { \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ + stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); // idct constants for each stage const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); @@ -3060,15 +3046,13 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, __m128i in[32], col[32]; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - 
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; @@ -3236,15 +3220,13 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, __m128i in[32], col[128], zero_idx[16]; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i, j, i32; @@ -3469,8 +3451,8 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, dc_value = _mm_set1_epi16(a); for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); + RECON_AND_STORE(dest + 0 + j * stride, dc_value); + RECON_AND_STORE(dest + 8 + j * stride, dc_value); RECON_AND_STORE(dest + 16 + j * stride, dc_value); RECON_AND_STORE(dest + 24 + j * stride, dc_value); } @@ -3595,8 +3577,7 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, tran_low_t temp_in[4], temp_out[4]; // Columns for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; vpx_highbd_idct4_c(temp_in, temp_out, bd); for (j = 0; j < 4; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -3685,19 +3666,18 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, __m128i d[8]; for (i = 0; i < 8; i++) { inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); inptr[i] = _mm_srai_epi16(inptr[i], 5); d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); // Store - 
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); } } } else { // Run the un-optimised column transform tran_low_t temp_in[8], temp_out[8]; for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); for (j = 0; j < 8; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -3789,19 +3769,18 @@ void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, __m128i d[8]; for (i = 0; i < 8; i++) { inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); inptr[i] = _mm_srai_epi16(inptr[i], 5); d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); } } } else { // Run the un-optimised column transform tran_low_t temp_in[8], temp_out[8]; for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); for (j = 0; j < 8; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -3897,25 +3876,24 @@ void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, { __m128i d[2]; for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + inptr[i] = _mm_add_epi16(inptr[i], rounding); + inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); + inptr[i] = _mm_srai_epi16(inptr[i], 6); + inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); } } } else { // Run the un-optimised column transform tran_low_t temp_in[16], temp_out[16]; for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); for (j = 0; j < 16; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -4016,25 +3994,24 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, { __m128i d[2]; for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = 
clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + inptr[i] = _mm_add_epi16(inptr[i], rounding); + inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); + inptr[i] = _mm_srai_epi16(inptr[i], 6); + inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); } } } else { // Run the un-optimised column transform tran_low_t temp_in[16], temp_out[16]; for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); for (j = 0; j < 16; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index bd520c18e56fea6363c07f384aed118722156754..d762a04abcd7b0f8d7b8d1ee0fee0c83b43b223c 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -47,16 +47,16 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ { \ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ } -static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { +static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); @@ -95,43 +95,43 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { static INLINE __m128i load_input_data(const tran_low_t *data) { #if CONFIG_VP9_HIGHBITDEPTH return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); + data[6], data[7]); #else return _mm_load_si128((const __m128i *)data); #endif } static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); + in[0] = load_input_data(input + 0 * 16); + in[1] = load_input_data(input + 1 * 16); + in[2] = load_input_data(input + 2 * 16); + in[3] = load_input_data(input + 3 * 16); 
+ in[4] = load_input_data(input + 4 * 16); + in[5] = load_input_data(input + 5 * 16); + in[6] = load_input_data(input + 6 * 16); + in[7] = load_input_data(input + 7 * 16); + + in[8] = load_input_data(input + 8 * 16); + in[9] = load_input_data(input + 9 * 16); + in[10] = load_input_data(input + 10 * 16); + in[11] = load_input_data(input + 11 * 16); + in[12] = load_input_data(input + 12 * 16); + in[13] = load_input_data(input + 13 * 16); + in[14] = load_input_data(input + 14 * 16); + in[15] = load_input_data(input + 15 * 16); } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ } static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); const __m128i zero = _mm_setzero_si128(); // Final rounding and shift in[0] = _mm_adds_epi16(in[0], final_rounding); @@ -168,16 +168,16 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { in[14] = _mm_srai_epi16(in[14], 6); in[15] = _mm_srai_epi16(in[15], 6); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); + RECON_AND_STORE(dest + 8 * stride, in[8]); + RECON_AND_STORE(dest + 9 * stride, in[9]); RECON_AND_STORE(dest + 10 * stride, in[10]); RECON_AND_STORE(dest + 11 * stride, in[11]); RECON_AND_STORE(dest + 12 * stride, in[12]); diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c index be1087c1e951a195ad867f2945853e4211dc8389..85923b4478ac7aa4e7a0f9ee75034b44d8f2cbb2 100644 --- a/vpx_dsp/x86/loopfilter_avx2.c +++ b/vpx_dsp/x86/loopfilter_avx2.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include <immintrin.h> /* AVX2 */ +#include <immintrin.h> /* AVX2 */ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" @@ -17,387 +17,353 @@ void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _blimit[0])); - - q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); - q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); + const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); + const __m128i blimit = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = 
_mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), - _mm_subs_epu8(q0p0, q1p1)); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), - _mm_subs_epu8(p0q0, q0p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), - _mm_subs_epu8(p1q1, q1p1)); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), - _mm_subs_epu8(q1p1, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), - _mm_subs_epu8(q2p2, q3p3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, 
q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } - // lp filter + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - /* Filter1 >> 3 */ - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128( - _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), - _mm_subs_epu8(q0p0, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), - _mm_subs_epu8(q0p0, q3p3))); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *) (s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); - q6p6 = _mm_castps_si128( - 
_mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *) (s + 6 * p))); - - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), - _mm_subs_epu8(q0p0, q4p4)), - _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), - _mm_subs_epu8(q0p0, q5p5))); - - q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *) (s + 7 * p))); - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), - _mm_subs_epu8(q0p0, q6p6)), - _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), - _mm_subs_epu8(q0p0, q7p7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, - _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, - _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, - _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), - 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), - 4); - flat2_q1p1 = _mm_packus_epi16(res_p, 
res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), - 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), - 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), - 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), - 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), - 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = 
_mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, 
_mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + 
res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + } } DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { - 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, + 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; @@ -405,575 +371,543 @@ void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, - q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, - p256_0, q256_0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) 
_blimit[0])); - - p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 5 * p))); - p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 4 * p))); - p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 3 * p))); - p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 2 * p))); - p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 1 * p))); - q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 0 * p))); - q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 1 * p))); - q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 2 * p))); - q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 3 * p))); - q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 4 * p))); - - p4 = _mm256_castsi256_si128(p256_4); - p3 = _mm256_castsi256_si128(p256_3); - p2 = _mm256_castsi256_si128(p256_2); - p1 = _mm256_castsi256_si128(p256_1); - p0 = _mm256_castsi256_si128(p256_0); - q0 = _mm256_castsi256_si128(q256_0); - q1 = _mm256_castsi256_si128(q256_1); - q2 = _mm256_castsi256_si128(q256_2); - q3 = _mm256_castsi256_si128(q256_3); - q4 = _mm256_castsi256_si128(q256_4); + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, + p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; + + const __m128i thresh = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); + const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); + const __m128i blimit = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + + p256_4 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); + p256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); + p256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); + p256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); + p256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); + q256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); + q256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); + q256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); + q256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); + q256_4 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); + + p4 = _mm256_castsi256_si128(p256_4); + p3 = _mm256_castsi256_si128(p256_3); + p2 = _mm256_castsi256_si128(p256_2); + p1 = _mm256_castsi256_si128(p256_1); + p0 = _mm256_castsi256_si128(p256_0); + q0 = _mm256_castsi256_si128(q256_0); + q1 = _mm256_castsi256_si128(q256_1); + q2 = _mm256_castsi256_si128(q256_2); + q3 = _mm256_castsi256_si128(q256_3); + q4 = _mm256_castsi256_si128(q256_4); + + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, 
p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, + flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - 
__m128i work; - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); + q256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); + p5 = _mm256_castsi256_si128(p256_5); + q5 = _mm256_castsi256_si128(q256_5); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); + q256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); + p6 = _mm256_castsi256_si128(p256_6); + q6 = _mm256_castsi256_si128(q256_6); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); + q256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); + p7 = _mm256_castsi256_si128(p256_7); + q7 = _mm256_castsi256_si128(q256_7); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } - // lp filter + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i 
ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, - flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, - flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, - flat_q2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - // loopfilter done - - { - __m128i work; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 6 * p))); - q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 5 * p))); - p5 = _mm256_castsi256_si128(p256_5); - q5 = _mm256_castsi256_si128(q256_5); - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - - flat2 = _mm_max_epu8(work, flat2); - p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 7 * p))); - q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 6 * p))); - p6 = _mm256_castsi256_si128(p256_6); - q6 = _mm256_castsi256_si128(q256_6); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - - flat2 = _mm_max_epu8(work, flat2); - - p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 8 * p))); - q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 7 * p))); - p7 = _mm256_castsi256_si128(p256_7); - q7 = _mm256_castsi256_si128(q256_7); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p7, 
p0), _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, - pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, - res_q; - - const __m256i filter = _mm256_load_si256( - (__m256i const *)filt_loopfilter_avx2); - p256_7 = _mm256_shuffle_epi8(p256_7, filter); - p256_6 = _mm256_shuffle_epi8(p256_6, filter); - p256_5 = _mm256_shuffle_epi8(p256_5, filter); - p256_4 = _mm256_shuffle_epi8(p256_4, filter); - p256_3 = _mm256_shuffle_epi8(p256_3, filter); - p256_2 = _mm256_shuffle_epi8(p256_2, filter); - p256_1 = _mm256_shuffle_epi8(p256_1, filter); - p256_0 = _mm256_shuffle_epi8(p256_0, filter); - q256_0 = _mm256_shuffle_epi8(q256_0, filter); - q256_1 = _mm256_shuffle_epi8(q256_1, filter); - q256_2 = _mm256_shuffle_epi8(q256_2, filter); - q256_3 = _mm256_shuffle_epi8(q256_3, filter); - q256_4 = _mm256_shuffle_epi8(q256_4, filter); - q256_5 = _mm256_shuffle_epi8(q256_5, filter); - q256_6 = _mm256_shuffle_epi8(q256_6, filter); - q256_7 = _mm256_shuffle_epi8(q256_7, filter); - - pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), - _mm256_add_epi16(p256_4, p256_3)); - pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), - _mm256_add_epi16(q256_4, q256_3)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, - _mm256_add_epi16(p256_2, p256_1)); - pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, - _mm256_add_epi16(q256_2, q256_1)); - pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - - pixelFilter_p = _mm256_add_epi16(eight, - _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(four, - _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(p256_7, p256_0)), 4); - - flat2_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(q256_7, q256_0)), 4); - - flat2_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(p256_3, p256_0)), 3); + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - flat_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + const __m256i filter = + _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); + p256_7 = _mm256_shuffle_epi8(p256_7, filter); + p256_6 = _mm256_shuffle_epi8(p256_6, filter); + p256_5 = _mm256_shuffle_epi8(p256_5, filter); + p256_4 = _mm256_shuffle_epi8(p256_4, filter); + p256_3 = _mm256_shuffle_epi8(p256_3, filter); + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = 
_mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + q256_3 = _mm256_shuffle_epi8(q256_3, filter); + q256_4 = _mm256_shuffle_epi8(q256_4, filter); + q256_5 = _mm256_shuffle_epi8(q256_5, filter); + q256_6 = _mm256_shuffle_epi8(q256_6, filter); + q256_7 = _mm256_shuffle_epi8(q256_7, filter); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(q256_3, q256_0)), 3); + pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); - flat_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + pixetFilter_p2p1p0 = + _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - sum_p7 = _mm256_add_epi16(p256_7, p256_7); + pixetFilter_q2q1q0 = + _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - sum_q7 = _mm256_add_epi16(q256_7, q256_7); + pixelFilter_p = _mm256_add_epi16( + eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - sum_p3 = _mm256_add_epi16(p256_3, p256_3); + pixetFilter_p2p1p0 = _mm256_add_epi16( + four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - sum_q3 = _mm256_add_epi16(q256_3, q256_3); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_1)), 4); + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - flat2_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), + 3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_1)), 4); + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - flat2_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), + 3); - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + sum_p7 = _mm256_add_epi16(p256_7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_1)), 3); + sum_q7 = _mm256_add_epi16(q256_7, q256_7); - flat_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + sum_p3 = _mm256_add_epi16(p256_3, p256_3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_1)), 3); + sum_q3 = _mm256_add_epi16(q256_3, q256_3); - flat_q1 = 
_mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); - sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_2)), 4); + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); - flat2_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), + 3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_2)), 4); + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - flat2_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), + 3); - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_2)), 3); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - flat_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_2)), 3); + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); - flat_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_3)), 4); + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - flat2_p3 = _mm256_castsi256_si128( - 
_mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_3)), 4); + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); - flat2_q3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), + 3); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), + 3); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_4)), 4); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - flat2_p4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_4)), 4); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); - flat2_q4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_5)), 4); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - flat2_p5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_5)), 4); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); - flat2_q5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + flat2_q4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixelFilter_q = 
_mm256_sub_epi16(pixelFilter_q, p256_1); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_6)), 4); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - flat2_p6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_6)), 4); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); - flat2_q6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - } + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - p2 = _mm_andnot_si128(flat, p2); - flat_p2 = _mm_and_si128(flat, flat_p2); - p2 = _mm_or_si128(flat_p2, p2); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); - p1 = _mm_andnot_si128(flat, ps1); - flat_p1 = _mm_and_si128(flat, flat_p1); - p1 = _mm_or_si128(flat_p1, p1); + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - p0 = _mm_andnot_si128(flat, ps0); - flat_p0 = _mm_and_si128(flat, flat_p0); - p0 = _mm_or_si128(flat_p0, p0); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - q0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(flat_q0, q0); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - q1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(flat_q1, q1); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(flat_q2, q2); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); - p6 = _mm_andnot_si128(flat2, p6); - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *) (s - 7 * p), p6); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); - p5 = _mm_andnot_si128(flat2, p5); - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *) (s - 6 * p), p5); + flat2_p6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - p4 = _mm_andnot_si128(flat2, p4); - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *) (s - 5 * p), p4); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); - p3 = _mm_andnot_si128(flat2, p3); - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *) (s - 4 * p), p3); - - p2 = _mm_andnot_si128(flat2, p2); - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *) (s - 3 * p), p2); - - p1 = _mm_andnot_si128(flat2, p1); - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *) (s - 2 * p), p1); - - p0 = _mm_andnot_si128(flat2, p0); - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *) (s - 1 * p), p0); - - q0 = _mm_andnot_si128(flat2, q0); - 
flat2_q0 = _mm_and_si128(flat2, flat2_q0); - q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *) (s - 0 * p), q0); - - q1 = _mm_andnot_si128(flat2, q1); - flat2_q1 = _mm_and_si128(flat2, flat2_q1); - q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *) (s + 1 * p), q1); - - q2 = _mm_andnot_si128(flat2, q2); - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *) (s + 2 * p), q2); - - q3 = _mm_andnot_si128(flat2, q3); - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *) (s + 3 * p), q3); - - q4 = _mm_andnot_si128(flat2, q4); - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *) (s + 4 * p), q4); - - q5 = _mm_andnot_si128(flat2, q5); - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *) (s + 5 * p), q5); - - q6 = _mm_andnot_si128(flat2, q6); - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *) (s + 6 * p), q6); + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = _mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + + q1 = _mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = _mm_or_si128(flat2_q1, q1); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + + q2 = _mm_andnot_si128(flat2, q2); + flat2_q2 = 
_mm_and_si128(flat2, flat2_q2); + q2 = _mm_or_si128(flat2_q2, q2); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + + q3 = _mm_andnot_si128(flat2, q3); + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + q3 = _mm_or_si128(flat2_q3, q3); + _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + + q4 = _mm_andnot_si128(flat2, q4); + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + q4 = _mm_or_si128(flat2_q4, q4); + _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + + q5 = _mm_andnot_si128(flat2, q5); + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + q5 = _mm_or_si128(flat2_q5, q5); + _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + + q6 = _mm_andnot_si128(flat2, q6); + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + q6 = _mm_or_si128(flat2_q6, q6); + _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + } } diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index 739adf31d067912d2c8c047f744800e7927ac70b..e13334ae0ea87c1d0f4f69950cf758674cf416f4 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -19,84 +19,89 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { } // filter_mask and hev_mask -#define FILTER_HEV_MASK do { \ - /* (abs(q1 - q0), abs(p1 - p0) */ \ - __m128i flat = abs_diff(q1p1, q0p0); \ - /* abs(p1 - q1), abs(p0 - q0) */ \ - const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ - __m128i abs_p0q0, abs_p1q1, work; \ - \ - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ - hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ - hev = _mm_packs_epi16(hev, hev); \ - \ - /* const int8_t mask = filter_mask(*limit, *blimit, */ \ - /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ - abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\ - abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ - /* abs(p3 - p2), abs(p2 - p1) */ \ - work = abs_diff(p3p2, p2p1); \ - flat = _mm_max_epu8(work, flat); \ - /* abs(q3 - q2), abs(q2 - q1) */ \ - work = abs_diff(q3q2, q2q1); \ - flat = _mm_max_epu8(work, flat); \ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ - mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ - mask = _mm_cmpeq_epi8(mask, zero); \ - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ -} while (0) - -#define FILTER4 do { \ - const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \ - 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ - __m128i filter, filter2filter1, work; \ - \ - ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ - qs1qs0 = _mm_xor_si128(q1q0, t80); \ - \ - /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ - work = _mm_subs_epi8(ps1ps0, qs1qs0); \ - filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ - /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ - filter = _mm_and_si128(filter, mask); /* & mask */ \ - filter = _mm_unpacklo_epi64(filter, filter); \ - \ - /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ - /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ - filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ - filter = 
_mm_unpackhi_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ - filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ - filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ - \ - /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ - filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ - filter = _mm_unpacklo_epi8(filter, filter); \ - filter = _mm_srai_epi16(filter, 9); /* round */ \ - filter = _mm_packs_epi16(filter, filter); \ - filter = _mm_andnot_si128(hev, filter); \ - \ - hev = _mm_unpackhi_epi64(filter2filter1, filter); \ - filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ - \ - /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ - qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ - /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ - ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ - qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ - ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ -} while (0) +#define FILTER_HEV_MASK \ + do { \ + /* (abs(q1 - q0), abs(p1 - p0) */ \ + __m128i flat = abs_diff(q1p1, q0p0); \ + /* abs(p1 - q1), abs(p0 - q0) */ \ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ + __m128i abs_p0q0, abs_p1q1, work; \ + \ + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ + hev = \ + _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ + hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_packs_epi16(hev, hev); \ + \ + /* const int8_t mask = filter_mask(*limit, *blimit, */ \ + /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ + abs_p0q0 = \ + _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ + abs_p1q1 = \ + _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ + /* abs(p3 - p2), abs(p2 - p1) */ \ + work = abs_diff(p3p2, p2p1); \ + flat = _mm_max_epu8(work, flat); \ + /* abs(q3 - q2), abs(q2 - q1) */ \ + work = abs_diff(q3q2, q2q1); \ + flat = _mm_max_epu8(work, flat); \ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ + mask = _mm_unpacklo_epi64(mask, flat); \ + mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_cmpeq_epi8(mask, zero); \ + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ + } while (0) + +#define FILTER4 \ + do { \ + const __m128i t3t4 = \ + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ + const __m128i t80 = _mm_set1_epi8(0x80); \ + __m128i filter, filter2filter1, work; \ + \ + ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ + qs1qs0 = _mm_xor_si128(q1q0, t80); \ + \ + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ + work = _mm_subs_epi8(ps1ps0, qs1qs0); \ + filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ + filter = _mm_and_si128(filter, mask); /* & mask */ \ + filter = _mm_unpacklo_epi64(filter, filter); \ + \ + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ + filter = 
_mm_unpackhi_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ + \ + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ + filter = _mm_unpacklo_epi8(filter, filter); \ + filter = _mm_srai_epi16(filter, 9); /* round */ \ + filter = _mm_packs_epi16(filter, filter); \ + filter = _mm_andnot_si128(hev, filter); \ + \ + hev = _mm_unpackhi_epi64(filter2filter1, filter); \ + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ + \ + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ + qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ + ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ + qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ + ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ + } while (0) void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, @@ -128,8 +133,8 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, FILTER4; _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 } @@ -238,27 +243,27 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, __m128i abs_p1p0; q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), - (__m64 *)(s + 4 * p))); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), - (__m64 *)(s + 3 * p))); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), - (__m64 *)(s + 2 * p))); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), - (__m64 *)(s + 1 * p))); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); p1q1 = _mm_shuffle_epi32(q1p1, 78); q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), - (__m64 *)(s - 0 * p))); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); fe = _mm_set1_epi8(0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = abs_diff(q0p0, p0q0); @@ -267,7 +272,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = 
_mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -276,8 +281,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); @@ -339,17 +343,17 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, flat = _mm_and_si128(flat, mask); q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *)(s + 5 * p))); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *)(s + 6 * p))); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *)(s + 7 * p))); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); @@ -369,7 +373,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - p7_16 = _mm_unpacklo_epi8(q7p7, zero);; + p7_16 = _mm_unpacklo_epi8(q7p7, zero); p6_16 = _mm_unpacklo_epi8(q6p6, zero); p5_16 = _mm_unpacklo_epi8(q5p5, zero); p4_16 = _mm_unpacklo_epi8(q4p4, zero); @@ -392,24 +396,23 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, _mm_add_epi16(q4_16, q3_16)); pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, - pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, - pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(q7_16, q0_16)), 4); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, 
_mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); flat_q0p0 = _mm_packus_epi16(res_p, res_q); @@ -420,18 +423,18 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q1_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); flat2_q1p1 = _mm_packus_epi16(res_p, res_q); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); flat_q1p1 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); @@ -441,59 +444,59 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q2_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); flat2_q2p2 = _mm_packus_epi16(res_p, res_q); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); flat_q2p2 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q3_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); flat2_q3p3 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q4_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, 
_mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); flat2_q4p4 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q5_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); flat2_q5p5 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q6_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); flat2_q6p6 = _mm_packus_epi16(res_p, res_q); } // wide flat @@ -554,7 +557,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); } } @@ -572,8 +575,8 @@ static INLINE __m128i filter8_mask(const __m128i *const flat, const __m128i *const other_filt, const __m128i *const f8_lo, const __m128i *const f8_hi) { - const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), - _mm_srli_epi16(*f8_hi, 3)); + const __m128i f8 = + _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3)); const __m128i result = _mm_and_si128(*flat, f8); return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } @@ -582,8 +585,8 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, const __m128i *const other_filt, const __m128i *const f_lo, const __m128i *const f_hi) { - const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), - _mm_srli_epi16(*f_hi, 4)); + const __m128i f = + _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4)); const __m128i result = _mm_and_si128(*flat, f); return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } @@ -633,7 +636,7 @@ void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, __m128i work; max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -832,16 +835,16 @@ void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, __m128i f_hi; f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 - f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), - _mm_add_epi16(p4_lo, f_lo)); + f_lo = + _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo)); f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), _mm_add_epi16(p2_lo, p1_lo)); f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), 
f_lo); f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 - f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), - _mm_add_epi16(p4_hi, f_hi)); + f_hi = + _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi)); f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), _mm_add_epi16(p2_hi, p1_hi)); f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); @@ -956,7 +959,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); @@ -964,7 +967,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -973,8 +976,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); @@ -982,8 +984,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, // flat_mask4 - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), - abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); @@ -1048,14 +1049,14 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), - t80); + const __m128i ps1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1134,8 +1135,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, } } -void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, - const uint8_t *_blimit0, +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, @@ -1170,17 +1170,17 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), 
- _mm_subs_epu8(q0, q1)); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; // filter_mask and hev_mask @@ -1188,7 +1188,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -1196,29 +1196,25 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, mask = _mm_max_epu8(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); // flat_mask4 - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); flat = _mm_max_epu8(work, flat); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); @@ -1289,14 +1285,14 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * 
p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1412,23 +1408,23 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, // filter_mask and hev_mask { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -1436,15 +1432,13 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, mask = _mm_max_epu8(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); @@ -1460,14 +1454,14 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1525,44 +1519,44 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, __m128i x8, x9, x10, x11, x12, x13, x14, x15; // 2-way interleave w/hoisting of unpacks - x0 = _mm_loadl_epi64((__m128i *)in0); // 1 + x0 = 
_mm_loadl_epi64((__m128i *)in0); // 1 x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 - x0 = _mm_unpacklo_epi8(x0, x1); // 1 + x0 = _mm_unpacklo_epi8(x0, x1); // 1 x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 - x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7 - x1 = _mm_unpacklo_epi8(x2, x3); // 2 + x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7 + x1 = _mm_unpacklo_epi8(x2, x3); // 2 - x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9 - x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11 - x2 = _mm_unpacklo_epi8(x4, x5); // 3 + x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9 + x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11 + x2 = _mm_unpacklo_epi8(x4, x5); // 3 - x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13 - x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15 - x3 = _mm_unpacklo_epi8(x6, x7); // 4 - x4 = _mm_unpacklo_epi16(x0, x1); // 9 + x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13 + x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15 + x3 = _mm_unpacklo_epi8(x6, x7); // 4 + x4 = _mm_unpacklo_epi16(x0, x1); // 9 - x8 = _mm_loadl_epi64((__m128i *)in1); // 2 + x8 = _mm_loadl_epi64((__m128i *)in1); // 2 x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 - x8 = _mm_unpacklo_epi8(x8, x9); // 5 - x5 = _mm_unpacklo_epi16(x2, x3); // 10 + x8 = _mm_unpacklo_epi8(x8, x9); // 5 + x5 = _mm_unpacklo_epi16(x2, x3); // 10 x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 - x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8 - x9 = _mm_unpacklo_epi8(x10, x11); // 6 + x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8 + x9 = _mm_unpacklo_epi8(x10, x11); // 6 - x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10 - x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12 - x10 = _mm_unpacklo_epi8(x12, x13); // 7 - x12 = _mm_unpacklo_epi16(x8, x9); // 11 + x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10 + x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12 + x10 = _mm_unpacklo_epi8(x12, x13); // 7 + x12 = _mm_unpacklo_epi16(x8, x9); // 11 - x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14 - x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16 - x11 = _mm_unpacklo_epi8(x14, x15); // 8 - x13 = _mm_unpacklo_epi16(x10, x11); // 12 + x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14 + x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16 + x11 = _mm_unpacklo_epi8(x14, x15); // 8 + x13 = _mm_unpacklo_epi16(x10, x11); // 12 - x6 = _mm_unpacklo_epi32(x4, x5); // 13 - x7 = _mm_unpackhi_epi32(x4, x5); // 14 + x6 = _mm_unpacklo_epi32(x4, x5); // 13 + x7 = _mm_unpackhi_epi32(x4, x5); // 14 x14 = _mm_unpacklo_epi32(x12, x13); // 15 x15 = _mm_unpackhi_epi32(x12, x13); // 16 @@ -1598,23 +1592,31 @@ static INLINE void transpose(unsigned char *src[], int in_p, unsigned char *in = src[idx8x8]; unsigned char *out = dst[idx8x8]; - x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 - x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 + x0 = + _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + x1 = + _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 x0 = _mm_unpacklo_epi8(x0, x1); - x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 - x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 + x2 = + _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + x3 = + _mm_loadl_epi64((__m128i 
*)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 x1 = _mm_unpacklo_epi8(x2, x3); - x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 - x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 + x4 = + _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + x5 = + _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 x2 = _mm_unpacklo_epi8(x4, x5); - x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 - x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 + x6 = + _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + x7 = + _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 x3 = _mm_unpacklo_epi8(x6, x7); @@ -1624,15 +1626,15 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpacklo_epi16(x2, x3); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0*out_p), + _mm_storel_pd((double *)(out + 0 * out_p), _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1*out_p), + _mm_storeh_pd((double *)(out + 1 * out_p), _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2*out_p), + _mm_storel_pd((double *)(out + 2 * out_p), _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3*out_p), + _mm_storeh_pd((double *)(out + 3 * out_p), _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 @@ -1641,25 +1643,23 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpackhi_epi16(x2, x3); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4*out_p), + _mm_storel_pd((double *)(out + 4 * out_p), _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5*out_p), + _mm_storeh_pd((double *)(out + 5 * out_p), _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 6*out_p), + _mm_storel_pd((double *)(out + 6 * out_p), _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7*out_p), + _mm_storeh_pd((double *)(out + 7 * out_p), _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 } while (++idx8x8 < num_8x8_to_transpose); } void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); unsigned char *src[2]; @@ -1705,10 +1705,8 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, } void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); unsigned char *src[2]; diff --git 
a/vpx_dsp/x86/masked_sad_intrin_ssse3.c b/vpx_dsp/x86/masked_sad_intrin_ssse3.c index 8b9ff1099d30c77bedf53d0ff982259ad1ddb08f..e07ff5f4df2dce826264b490f18ecef1a06dd222 100644 --- a/vpx_dsp/x86/masked_sad_intrin_ssse3.c +++ b/vpx_dsp/x86/masked_sad_intrin_ssse3.c @@ -17,17 +17,17 @@ #include "vpx/vpx_integer.h" static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) { - __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr); - __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride)); + __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); + __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); return _mm_unpacklo_epi64(temp1, temp2); } static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) { - __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); - __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride)); + __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr); + __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride)); __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2); - temp1 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 2)); - temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 3)); + temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2)); + temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3)); temp1 = _mm_unpacklo_epi32(temp1, temp2); return _mm_unpacklo_epi64(temp3, temp1); } @@ -37,32 +37,21 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, const uint8_t *m_ptr, int m_stride, int width, int height); -static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr, - int a_stride, - const uint8_t *b_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height); - -static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr, - int a_stride, - const uint8_t *b_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height); - -#define MASKSADMXN_SSSE3(m, n) \ -unsigned int vpx_masked_sad##m##x##n##_ssse3(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \ - m, n); \ -} +static INLINE unsigned int masked_sad8xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +static INLINE unsigned int masked_sad4xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +#define MASKSADMXN_SSSE3(m, n) \ + unsigned int vpx_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \ + m, n); \ + } #if CONFIG_EXT_PARTITION MASKSADMXN_SSSE3(128, 128) @@ -78,28 +67,25 @@ MASKSADMXN_SSSE3(16, 32) MASKSADMXN_SSSE3(16, 16) MASKSADMXN_SSSE3(16, 8) -#define MASKSAD8XN_SSSE3(n) \ -unsigned int vpx_masked_sad8x##n##_ssse3(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, n); \ -} +#define MASKSAD8XN_SSSE3(n) \ + unsigned int vpx_masked_sad8x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return 
masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } MASKSAD8XN_SSSE3(16) MASKSAD8XN_SSSE3(8) MASKSAD8XN_SSSE3(4) -#define MASKSAD4XN_SSSE3(n) \ -unsigned int vpx_masked_sad4x##n##_ssse3(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, n); \ -} +#define MASKSAD4XN_SSSE3(n) \ + unsigned int vpx_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } MASKSAD4XN_SSSE3(8) MASKSAD4XN_SSSE3(4) @@ -119,9 +105,9 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, // Covering the full width for (x = 0; x < width; x += 16) { // Load a, b, m in xmm registers - a = _mm_loadu_si128((const __m128i*)(a_ptr + x)); - b = _mm_loadu_si128((const __m128i*)(b_ptr + x)); - m = _mm_loadu_si128((const __m128i*)(m_ptr + x)); + a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); + b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); + m = _mm_loadu_si128((const __m128i *)(m_ptr + x)); // Calculate the difference between a & b temp1 = _mm_subs_epu8(a, b); @@ -144,13 +130,9 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, return (_mm_cvtsi128_si32(res) + 31) >> 6; } -static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr, - int a_stride, - const uint8_t *b_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height) { +static INLINE unsigned int masked_sad8xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i a, b, m, temp1, temp2, row_res; __m128i res = _mm_setzero_si128(); @@ -184,13 +166,9 @@ static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr, return (_mm_cvtsi128_si32(res) + 31) >> 6; } -static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr, - int a_stride, - const uint8_t *b_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height) { +static INLINE unsigned int masked_sad4xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i a, b, m, temp1, temp2, row_res; __m128i res = _mm_setzero_si128(); @@ -228,37 +206,26 @@ static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr, #if CONFIG_VP9_HIGHBITDEPTH static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr, int stride) { - __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr); - __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride)); + __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); + __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); return _mm_unpacklo_epi64(temp1, temp2); } -static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr, - int a_stride, - const uint8_t *b8_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int width, int height); - -static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr, - int a_stride, - const uint8_t *b8_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height); - -#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ -unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int 
ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, m, n); \ -} +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, int height); + +static INLINE unsigned int highbd_masked_sad4xh_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ + unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } #if CONFIG_EXT_PARTITION HIGHBD_MASKSADMXN_SSSE3(128, 128) @@ -277,29 +244,22 @@ HIGHBD_MASKSADMXN_SSSE3(8, 16) HIGHBD_MASKSADMXN_SSSE3(8, 8) HIGHBD_MASKSADMXN_SSSE3(8, 4) -#define HIGHBD_MASKSAD4XN_SSSE3(n) \ -unsigned int vpx_highbd_masked_sad4x##n##_ssse3(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, n); \ -} +#define HIGHBD_MASKSAD4XN_SSSE3(n) \ + unsigned int vpx_highbd_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } HIGHBD_MASKSAD4XN_SSSE3(8) HIGHBD_MASKSAD4XN_SSSE3(4) // For width a multiple of 8 // Assumes values in m are <=64 -static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr, - int a_stride, - const uint8_t *b8_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int width, int height) { +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, int height) { int y, x; __m128i a, b, m, temp1, temp2; const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); @@ -310,9 +270,9 @@ static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr, // Covering the full width for (x = 0; x < width; x += 8) { // Load a, b, m in xmm registers - a = _mm_loadu_si128((const __m128i*)(a_ptr + x)); - b = _mm_loadu_si128((const __m128i*)(b_ptr + x)); - m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(m_ptr + x)), + a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); + b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); + m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)), _mm_setzero_si128()); // Calculate the difference between a & b @@ -334,13 +294,9 @@ static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr, return (_mm_cvtsi128_si32(res) + 31) >> 6; } -static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr, - int a_stride, - const uint8_t *b8_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height) { +static INLINE unsigned int highbd_masked_sad4xh_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i a, b, m, temp1, temp2; const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); @@ -351,8 +307,8 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t 
*a8_ptr, // Load a, b, m in xmm registers a = highbd_width4_load_2rows(a_ptr, a_stride); b = highbd_width4_load_2rows(b_ptr, b_stride); - temp1 = _mm_loadl_epi64((const __m128i*)m_ptr); - temp2 = _mm_loadl_epi64((const __m128i*)(m_ptr + m_stride)); + temp1 = _mm_loadl_epi64((const __m128i *)m_ptr); + temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)); m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2), _mm_setzero_si128()); diff --git a/vpx_dsp/x86/masked_variance_intrin_ssse3.c b/vpx_dsp/x86/masked_variance_intrin_ssse3.c index a0c2b6e853ab40589068befdd347815fe022adeb..ae08422746dd0e998bac56f8913b2268ec7b3d57 100644 --- a/vpx_dsp/x86/masked_variance_intrin_ssse3.c +++ b/vpx_dsp/x86/masked_variance_intrin_ssse3.c @@ -18,9 +18,8 @@ #include "vpx_ports/mem.h" #include "vpx_dsp/vpx_filter.h" - // Half pixel shift -#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS/2) +#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2) /***************************************************************************** * Horizontal additions @@ -39,7 +38,7 @@ static INLINE int64_t hsum_epi64_si64(__m128i v_q) { #else { int64_t tmp; - _mm_storel_epi64((__m128i*)&tmp, v_q); + _mm_storel_epi64((__m128i *)&tmp, v_q); return tmp; } #endif @@ -47,7 +46,7 @@ static INLINE int64_t hsum_epi64_si64(__m128i v_q) { #if CONFIG_VP9_HIGHBITDEPTH static INLINE int64_t hsum_epi32_si64(__m128i v_d) { - const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); @@ -55,8 +54,8 @@ static INLINE int64_t hsum_epi32_si64(__m128i v_d) { #endif // CONFIG_VP9_HIGHBITDEPTH static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q, - uint32_t* sse, - const int w, const int h) { + uint32_t *sse, const int w, + const int h) { int64_t sum64; uint64_t sse64; @@ -73,7 +72,7 @@ static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q, // Store the SSE *sse = (uint32_t)sse64; // Compute the variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); } /***************************************************************************** @@ -81,11 +80,8 @@ static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q, *****************************************************************************/ static INLINE unsigned int masked_variancewxh_ssse3( - const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse) { + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { int ii, jj; const __m128i v_zero = _mm_setzero_si128(); @@ -96,11 +92,11 @@ static INLINE unsigned int masked_variancewxh_ssse3( assert((w % 16) == 0); for (ii = 0; ii < h; ii++) { - for (jj = 0 ; jj < w ; jj += 16) { + for (jj = 0; jj < w; jj += 16) { // Load inputs - 8 bits - const __m128i v_a_b = _mm_loadu_si128((const __m128i*)(a+jj)); - const __m128i v_b_b = _mm_loadu_si128((const __m128i*)(b+jj)); - const __m128i v_m_b = _mm_loadu_si128((const __m128i*)(m+jj)); + const __m128i v_a_b = _mm_loadu_si128((const __m128i *)(a + jj)); + const __m128i v_b_b = _mm_loadu_si128((const __m128i *)(b + jj)); + const __m128i v_m_b = _mm_loadu_si128((const __m128i *)(m + jj)); // Unpack to 16 bits 
- still containing max 8 bits const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); @@ -147,17 +143,13 @@ static INLINE unsigned int masked_variancewxh_ssse3( return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); } -#define MASKED_VARWXH(W, H) \ -unsigned int vpx_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - return masked_variancewxh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - W, H, sse); \ -} +#define MASKED_VARWXH(W, H) \ + unsigned int vpx_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, m_stride, W, \ + H, sse); \ + } MASKED_VARWXH(16, 8) MASKED_VARWXH(16, 16) @@ -178,11 +170,8 @@ MASKED_VARWXH(128, 128) *****************************************************************************/ static INLINE unsigned int masked_variance8xh_ssse3( - const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int h, - unsigned int *sse) { + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, unsigned int *sse) { int ii; const __m128i v_zero = _mm_setzero_si128(); @@ -192,9 +181,9 @@ static INLINE unsigned int masked_variance8xh_ssse3( for (ii = 0; ii < h; ii++) { // Load inputs - 8 bits - const __m128i v_a_b = _mm_loadl_epi64((const __m128i*)a); - const __m128i v_b_b = _mm_loadl_epi64((const __m128i*)b); - const __m128i v_m_b = _mm_loadl_epi64((const __m128i*)m); + const __m128i v_a_b = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b_b = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)m); // Unpack to 16 bits - still containing max 8 bits const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero); @@ -229,17 +218,13 @@ static INLINE unsigned int masked_variance8xh_ssse3( return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); } -#define MASKED_VAR8XH(H) \ -unsigned int vpx_masked_variance8x##H##_ssse3( \ - const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - return masked_variance8xh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - H, sse); \ -} +#define MASKED_VAR8XH(H) \ + unsigned int vpx_masked_variance8x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variance8xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ + sse); \ + } MASKED_VAR8XH(4) MASKED_VAR8XH(8) @@ -250,11 +235,8 @@ MASKED_VAR8XH(16) *****************************************************************************/ static INLINE unsigned int masked_variance4xh_ssse3( - const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int h, - unsigned int *sse) { + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, unsigned int *sse) { int ii; const __m128i v_zero = _mm_setzero_si128(); @@ -264,14 +246,14 @@ static INLINE unsigned int masked_variance4xh_ssse3( assert((h % 2) == 0); - for (ii = 0; ii < h/2; ii++) { + for (ii = 0; ii < h / 2; ii++) { // Load 2 input rows - 8 bits - const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t*)a); - const __m128i v_b0_b = _mm_cvtsi32_si128(*(const 
uint32_t*)b); - const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t*)m); - const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t*)(a + a_stride)); - const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t*)(b + b_stride)); - const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t*)(m + m_stride)); + const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t *)a); + const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t *)b); + const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); + const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t *)(a + a_stride)); + const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t *)(b + b_stride)); + const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); // Interleave 2 rows into a single register const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b); @@ -311,17 +293,13 @@ static INLINE unsigned int masked_variance4xh_ssse3( return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); } -#define MASKED_VAR4XH(H) \ -unsigned int vpx_masked_variance4x##H##_ssse3( \ - const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - return masked_variance4xh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - H, sse); \ -} +#define MASKED_VAR4XH(H) \ + unsigned int vpx_masked_variance4x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variance4xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ + sse); \ + } MASKED_VAR4XH(4) MASKED_VAR4XH(8) @@ -330,11 +308,8 @@ MASKED_VAR4XH(8) // Main calculation for n*8 wide blocks static INLINE void highbd_masked_variance64_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - int64_t *sum, uint64_t *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, int64_t *sum, uint64_t *sse) { int ii, jj; const __m128i v_zero = _mm_setzero_si128(); @@ -345,11 +320,11 @@ static INLINE void highbd_masked_variance64_ssse3( assert((w % 8) == 0); for (ii = 0; ii < h; ii++) { - for (jj = 0 ; jj < w ; jj += 8) { + for (jj = 0; jj < w; jj += 8) { // Load inputs - 8 bits - const __m128i v_a_w = _mm_loadu_si128((const __m128i*)(a+jj)); - const __m128i v_b_w = _mm_loadu_si128((const __m128i*)(b+jj)); - const __m128i v_m_b = _mm_loadl_epi64((const __m128i*)(m+jj)); + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)(a + jj)); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)(b + jj)); + const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)(m + jj)); // Unpack m to 16 bits - still containing max 8 bits const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); @@ -396,18 +371,15 @@ static INLINE void highbd_masked_variance64_ssse3( *sse = hsum_epi64_si64(v_sse_q); // Round - *sum = (*sum >= 0) ? *sum : -*sum; + *sum = (*sum >= 0) ? 
*sum : -*sum; *sum = ROUND_POWER_OF_TWO(*sum, 6); *sse = ROUND_POWER_OF_TWO(*sse, 12); } // Main calculation for 4 wide blocks static INLINE void highbd_masked_variance64_4wide_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int h, - int64_t *sum, uint64_t *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, int64_t *sum, uint64_t *sse) { int ii; const __m128i v_zero = _mm_setzero_si128(); @@ -417,14 +389,14 @@ static INLINE void highbd_masked_variance64_4wide_ssse3( assert((h % 2) == 0); - for (ii = 0; ii < h/2; ii++) { + for (ii = 0; ii < h / 2; ii++) { // Load 2 input rows - 8 bits - const __m128i v_a0_w = _mm_loadl_epi64((const __m128i*)a); - const __m128i v_b0_w = _mm_loadl_epi64((const __m128i*)b); - const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t*)m); - const __m128i v_a1_w = _mm_loadl_epi64((const __m128i*)(a + a_stride)); - const __m128i v_b1_w = _mm_loadl_epi64((const __m128i*)(b + b_stride)); - const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t*)(m + m_stride)); + const __m128i v_a0_w = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0_w = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); + const __m128i v_a1_w = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b1_w = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); // Interleave 2 rows into a single register const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w); @@ -475,26 +447,23 @@ static INLINE void highbd_masked_variance64_4wide_ssse3( *sse = hsum_epi64_si64(v_sse_q); // Round - *sum = (*sum >= 0) ? *sum : -*sum; + *sum = (*sum >= 0) ? 
*sum : -*sum; *sum = ROUND_POWER_OF_TWO(*sum, 6); *sse = ROUND_POWER_OF_TWO(*sse, 12); } static INLINE unsigned int highbd_masked_variancewxh_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { uint64_t sse64; int64_t sum64; if (w == 4) - highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, - h, &sum64, &sse64); + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); else - highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, - &sum64, &sse64); + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); // Store the SSE *sse = (uint32_t)sse64; @@ -503,20 +472,17 @@ static INLINE unsigned int highbd_masked_variancewxh_ssse3( } static INLINE unsigned int highbd_10_masked_variancewxh_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { uint64_t sse64; int64_t sum64; if (w == 4) - highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, - h, &sum64, &sse64); + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); else - highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, - &sum64, &sse64); + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); // Normalise sum64 = ROUND_POWER_OF_TWO(sum64, 2); @@ -529,20 +495,17 @@ static INLINE unsigned int highbd_10_masked_variancewxh_ssse3( } static INLINE unsigned int highbd_12_masked_variancewxh_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { uint64_t sse64; int64_t sum64; if (w == 4) - highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, - h, &sum64, &sse64); + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); else - highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, - &sum64, &sse64); + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); sum64 = ROUND_POWER_OF_TWO(sum64, 4); sse64 = ROUND_POWER_OF_TWO(sse64, 8); @@ -553,45 +516,33 @@ static INLINE unsigned int highbd_12_masked_variancewxh_ssse3( return *sse - (uint32_t)((sum64 * sum64) / (w * h)); } -#define HIGHBD_MASKED_VARWXH(W, H) \ -unsigned int vpx_highbd_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a8, int a_stride, \ - const uint8_t *b8, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ - uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ - return highbd_masked_variancewxh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - W, H, sse); \ -} \ - \ -unsigned int vpx_highbd_10_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a8, int a_stride, \ - const uint8_t *b8, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ 
- uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ - uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ - return highbd_10_masked_variancewxh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - W, H, sse); \ -} \ - \ -unsigned int vpx_highbd_12_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a8, int a_stride, \ - const uint8_t *b8, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ - uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ - return highbd_12_masked_variancewxh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - W, H, sse); \ -} +#define HIGHBD_MASKED_VARWXH(W, H) \ + unsigned int vpx_highbd_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } \ + \ + unsigned int vpx_highbd_10_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_10_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } \ + \ + unsigned int vpx_highbd_12_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_12_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } HIGHBD_MASKED_VARWXH(4, 4) HIGHBD_MASKED_VARWXH(4, 8) @@ -619,11 +570,11 @@ HIGHBD_MASKED_VARWXH(128, 128) ////////////////////////////////////////////////////////////////////////////// typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b, - __m128i v_filter_b); + __m128i v_filter_b); static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b, const __m128i v_filter_b) { - (void) v_filter_b; + (void)v_filter_b; return _mm_avg_epu8(v_a_b, v_b_b); } @@ -634,28 +585,27 @@ static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b, __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b); __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b); __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b); - __m128i v_res_lo_w = _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), - FILTER_BITS); - __m128i v_res_hi_w = _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), - FILTER_BITS); + __m128i v_res_lo_w = + _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); + __m128i v_res_hi_w = + _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), FILTER_BITS); return _mm_packus_epi16(v_res_lo_w, v_res_hi_w); } // Apply the filter to the contents of the lower half of a and b static INLINE void apply_filter_lo(const __m128i v_a_lo_b, const __m128i v_b_lo_b, - const __m128i v_filter_b, - __m128i* v_res_w) { + const __m128i v_filter_b, __m128i *v_res_w) { const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1)); __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b); __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b); - *v_res_w = _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), - FILTER_BITS); + *v_res_w = + 
_mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); } static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b, - const __m128i v_m_b, __m128i* v_sum_d, - __m128i* v_sse_q) { + const __m128i v_m_b, __m128i *v_sum_d, + __m128i *v_sse_q) { const __m128i v_zero = _mm_setzero_si128(); // Unpack to 16 bits - still containing max 8 bits const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); @@ -694,37 +644,38 @@ static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b, } // Functions for width (W) >= 16 -unsigned int vpx_masked_subpel_varWxH_xzero( - const uint8_t *src, int src_stride, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, filter_fn_t filter_fn) { +unsigned int vpx_masked_subpel_varWxH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int w, int h, + filter_fn_t filter_fn) { int i, j; __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + const __m128i v_filter_b = _mm_set1_epi16( + (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); for (j = 0; j < w; j += 16) { // Load the first row ready - v_src0_b = _mm_loadu_si128((const __m128i*)(src + j)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); // Process 2 rows at a time for (i = 0; i < h; i += 2) { // Load the next row apply the filter - v_src1_b = _mm_loadu_si128((const __m128i*)(src + j + src_stride)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j)); + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Load the next row apply the filter - v_src0_b = _mm_loadu_si128((const __m128i*)(src + j + src_stride * 2)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j + dst_stride)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j + msk_stride)); + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j + msk_stride)); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next block of rows src += src_stride * 2; @@ -738,28 +689,29 @@ unsigned int vpx_masked_subpel_varWxH_xzero( } return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); } -unsigned int vpx_masked_subpel_varWxH_yzero( - const uint8_t *src, int src_stride, int xoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, filter_fn_t filter_fn) { +unsigned int vpx_masked_subpel_varWxH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int w, int h, + filter_fn_t filter_fn) { int i, j; __m128i v_src0_b, v_src1_b, 
v_res_b, v_dst_b, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); + const __m128i v_filter_b = _mm_set1_epi16( + (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i++) { for (j = 0; j < w; j += 16) { // Load this row and one below & apply the filter to them - v_src0_b = _mm_loadu_si128((const __m128i*)(src + j)); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + j + 1)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j)); + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); } src += src_stride; @@ -769,49 +721,47 @@ unsigned int vpx_masked_subpel_varWxH_yzero( return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); } unsigned int vpx_masked_subpel_varWxH_xnonzero_ynonzero( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, filter_fn_t xfilter_fn, - filter_fn_t yfilter_fn) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int w, int h, filter_fn_t xfilter_fn, + filter_fn_t yfilter_fn) { int i, j; __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b; __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filterx_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - const __m128i v_filtery_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + const __m128i v_filterx_b = _mm_set1_epi16( + (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); + const __m128i v_filtery_b = _mm_set1_epi16( + (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); assert(xoffset < BIL_SUBPEL_SHIFTS); for (j = 0; j < w; j += 16) { // Load the first row ready - v_src0_b = _mm_loadu_si128((const __m128i*)(src + j)); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + j + 1)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); // Process 2 rows at a time for (i = 0; i < h; i += 2) { // Load the next row & apply the filter - v_src2_b = _mm_loadu_si128((const __m128i*)(src + src_stride + j)); - v_src3_b = _mm_loadu_si128((const __m128i*)(src + src_stride + j + 1)); + v_src2_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); + v_src3_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j)); + v_dst_b = 
_mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); // Complete the calculation for this row and add it to the running total v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Load the next row & apply the filter - v_src0_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + j)); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + - j + 1)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); + v_src1_b = + _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + dst_stride + j)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + msk_stride + j)); + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + msk_stride + j)); // Complete the calculation for this row and add it to the running total v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); @@ -830,58 +780,61 @@ unsigned int vpx_masked_subpel_varWxH_xnonzero_ynonzero( // Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, // xmm[63:32] = row 3, xmm[31:0] = row 4 -unsigned int vpx_masked_subpel_var4xH_xzero( - const uint8_t *src, int src_stride, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +unsigned int vpx_masked_subpel_var4xH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { int i; __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w; __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b; __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first row of src data ready - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); for (i = 0; i < h; i += 4) { // Load the rest of the source data for these rows - v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); - v_src2_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); - v_src3_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 3)); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); - v_src0_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 4)); + v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); // Load the dst data - v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 0)); - v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 1)); + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const 
uint32_t *)(dst + dst_stride * 1)); v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); - v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 2)); - v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 3)); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); // Load the mask data - v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 0)); - v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 1)); + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); - v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 2)); - v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 3)); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); // Apply the y filter if (yoffset == HALF_PIXEL_OFFSET) { v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b); - v_src2_b = _mm_or_si128(_mm_slli_si128(v_src1_b, 4), - _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src1_b, 4), + _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b); } else { - v_src2_b = _mm_or_si128(_mm_slli_si128(v_src1_b, 4), - _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0))); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src1_b, 4), + _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0))); apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w); - v_src2_b = _mm_or_si128(_mm_slli_si128(v_src3_b, 4), - _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src3_b, 4), + _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w); v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w); } @@ -896,49 +849,49 @@ unsigned int vpx_masked_subpel_var4xH_xzero( } // Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 -unsigned int vpx_masked_subpel_var8xH_xzero( - const uint8_t *src, int src_stride, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +unsigned int vpx_masked_subpel_var8xH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { int i; __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b; __m128i v_dst_b = _mm_setzero_si128(); __m128i v_msk_b = _mm_setzero_si128(); __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first row of src data ready - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); for (i = 0; i < h; i += 2) { if (yoffset == HALF_PIXEL_OFFSET) { // Load the 
rest of the source data for these rows v_src1_b = _mm_or_si128( - _mm_slli_si128(v_src0_b, 8), - _mm_loadl_epi64((const __m128i*)(src + src_stride * 1))); + _mm_slli_si128(v_src0_b, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); v_src0_b = _mm_or_si128( - _mm_slli_si128(v_src1_b, 8), - _mm_loadl_epi64((const __m128i*)(src + src_stride * 2))); + _mm_slli_si128(v_src1_b, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); // Apply the y filter v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b); } else { // Load the data and apply the y filter - v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w); - v_src0_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); + v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w); v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w); } // Load the dst data v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1)), - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); // Load the mask data v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1)), - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); // Compute the sum and SSE sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -951,10 +904,11 @@ unsigned int vpx_masked_subpel_var8xH_xzero( // Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, // xmm[63:32] = row 3, xmm[31:0] = row 4 -unsigned int vpx_masked_subpel_var4xH_yzero( - const uint8_t *src, int src_stride, int xoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +unsigned int vpx_masked_subpel_var4xH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { int i; __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; @@ -962,38 +916,37 @@ unsigned int vpx_masked_subpel_var4xH_yzero( __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i += 4) { // Load the src data - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - v_src2_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); v_src0_shift_b 
= _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); - v_src3_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 3)); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b); // Load the dst data - v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 0)); - v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 1)); + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); - v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 2)); - v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 3)); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); // Load the mask data - v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 0)); - v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 1)); + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); - v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 2)); - v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 3)); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); // Apply the x filter @@ -1016,24 +969,24 @@ unsigned int vpx_masked_subpel_var4xH_yzero( return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); } -unsigned int vpx_masked_subpel_var8xH_yzero( - const uint8_t *src, int src_stride, int xoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +unsigned int vpx_masked_subpel_var8xH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { int i; __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w; __m128i v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i += 2) { // Load the src data - v_src0_b = _mm_loadu_si128((const __m128i*)(src)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src)); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1047,12 +1000,12 @@ unsigned int vpx_masked_subpel_var8xH_yzero( } // Load the dst data v_dst_b = _mm_unpacklo_epi64( - 
_mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); // Load the mask data v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); // Compute the sum and SSE sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1066,9 +1019,9 @@ unsigned int vpx_masked_subpel_var8xH_yzero( // Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, // xmm[63:32] = row 3, xmm[31:0] = row 4 unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h) { int i; __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; @@ -1077,25 +1030,23 @@ unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( __m128i v_xres_b[2]; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filterx_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - __m128i v_filtery_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); assert(yoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i += 4) { // Load the src data - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - v_src2_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); - v_src3_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 3)); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b); @@ -1113,18 +1064,17 @@ unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( src += src_stride * 4; } // Load one more row to be used in the y filter - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { - v_extra_row_b = _mm_and_si128( - _mm_avg_epu8(v_src0_b, v_src0_shift_b), - 
_mm_setr_epi32(-1, 0, 0, 0)); + v_extra_row_b = _mm_and_si128(_mm_avg_epu8(v_src0_b, v_src0_shift_b), + _mm_setr_epi32(-1, 0, 0, 0)); } else { apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); - v_extra_row_b = _mm_and_si128( - _mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()), - _mm_setr_epi32(-1, 0, 0, 0)); + v_extra_row_b = + _mm_and_si128(_mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()), + _mm_setr_epi32(-1, 0, 0, 0)); } for (i = 0; i < h; i += 4) { @@ -1143,19 +1093,19 @@ unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( } // Load the dst data - v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 0)); - v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 1)); + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); - v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 2)); - v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 3)); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); // Load the mask data - v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 0)); - v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 1)); + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); - v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 2)); - v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 3)); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); // Compute the sum and SSE @@ -1168,27 +1118,25 @@ unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( } unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h) { int i; __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b; __m128i v_src0_shift_b, v_src1_shift_b; __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filterx_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - __m128i v_filtery_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first block of src data - v_src0_b = _mm_loadu_si128((const __m128i*)(src)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src)); 
v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1202,9 +1150,9 @@ unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( } for (i = 0; i < h; i += 4) { // Load the next block of src data - v_src0_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 2)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 3)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1226,19 +1174,19 @@ unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( } // Load the dst data v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); // Load the mask data v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); // Compute the sum and SSE sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Load the next block of src data - v_src0_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 4)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 5)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1260,12 +1208,12 @@ unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( } // Load the dst data v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); // Load the mask data v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); // Compute the sum and SSE sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1276,89 +1224,77 @@ unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); } - // For W >=16 #define MASK_SUBPIX_VAR_LARGE(W, H) \ -unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - assert(W % 16 == 0); \ - if (xoffset == 0) { \ - if (yoffset == 0) \ - return vpx_masked_variance##W##x##H##_ssse3(src, src_stride, \ - dst, dst_stride, \ - msk, msk_stride, sse); \ - else if (yoffset == HALF_PIXEL_OFFSET) \ - return 
vpx_masked_subpel_varWxH_xzero(src, src_stride, \ - HALF_PIXEL_OFFSET, \ - dst, dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter_avg); \ - else \ - return vpx_masked_subpel_varWxH_xzero(src, src_stride, \ - yoffset, \ - dst, dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter); \ - } else if (yoffset == 0) { \ - if (xoffset == HALF_PIXEL_OFFSET) \ - return vpx_masked_subpel_varWxH_yzero(src, src_stride, \ - HALF_PIXEL_OFFSET, \ - dst, dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter_avg); \ - else \ - return vpx_masked_subpel_varWxH_yzero(src, src_stride, \ - xoffset, \ - dst, dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter); \ - } else if (xoffset == HALF_PIXEL_OFFSET) { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride, \ - HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter_avg, apply_filter_avg); \ - else \ - return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride, \ - HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter_avg, apply_filter); \ - } else { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride, \ - xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter, apply_filter_avg); \ - else \ - return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride, \ - xoffset, yoffset, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter, apply_filter); \ - } \ -} + unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + assert(W % 16 == 0); \ + if (xoffset == 0) { \ + if (yoffset == 0) \ + return vpx_masked_variance##W##x##H##_ssse3( \ + src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ + else if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_masked_subpel_varWxH_xzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg); \ + else \ + return vpx_masked_subpel_varWxH_xzero(src, src_stride, yoffset, dst, \ + dst_stride, msk, msk_stride, \ + sse, W, H, apply_filter); \ + } else if (yoffset == 0) { \ + if (xoffset == HALF_PIXEL_OFFSET) \ + return vpx_masked_subpel_varWxH_yzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg); \ + else \ + return vpx_masked_subpel_varWxH_yzero(src, src_stride, xoffset, dst, \ + dst_stride, msk, msk_stride, \ + sse, W, H, apply_filter); \ + } else if (xoffset == HALF_PIXEL_OFFSET) { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ + dst_stride, msk, msk_stride, sse, W, H, apply_filter_avg, \ + apply_filter_avg); \ + else \ + return vpx_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg, apply_filter); \ + } else { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter, apply_filter_avg); \ + else \ + return vpx_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, 
H, apply_filter, apply_filter); \ + } \ + } // For W < 16 #define MASK_SUBPIX_VAR_SMALL(W, H) \ -unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - assert(W == 4 || W == 8); \ - if (xoffset == 0 && yoffset == 0) \ - return vpx_masked_variance##W##x##H##_ssse3(src, src_stride, \ - dst, dst_stride, \ - msk, msk_stride, sse); \ - else if (xoffset == 0) \ - return vpx_masked_subpel_var##W##xH_xzero(src, src_stride, yoffset, \ - dst, dst_stride, \ - msk, msk_stride, sse, H); \ - else if (yoffset == 0) \ - return vpx_masked_subpel_var##W##xH_yzero(src, src_stride, xoffset, \ - dst, dst_stride, \ - msk, msk_stride, sse, H); \ - else \ - return vpx_masked_subpel_var##W##xH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, \ - msk, msk_stride, sse, H); \ -} + unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + assert(W == 4 || W == 8); \ + if (xoffset == 0 && yoffset == 0) \ + return vpx_masked_variance##W##x##H##_ssse3( \ + src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ + else if (xoffset == 0) \ + return vpx_masked_subpel_var##W##xH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H); \ + else if (yoffset == 0) \ + return vpx_masked_subpel_var##W##xH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H); \ + else \ + return vpx_masked_subpel_var##W##xH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ + sse, H); \ + } MASK_SUBPIX_VAR_SMALL(4, 4) MASK_SUBPIX_VAR_SMALL(4, 8) @@ -1381,20 +1317,19 @@ MASK_SUBPIX_VAR_LARGE(128, 128) #if CONFIG_VP9_HIGHBITDEPTH typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q, - uint32_t *sse, - const int w, const int h); -typedef unsigned int (*highbd_variance_fn_t)( - const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - unsigned int *sse); + uint32_t *sse, const int w, + const int h); +typedef unsigned int (*highbd_variance_fn_t)(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, + unsigned int *sse); typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w, - __m128i v_filter_w); + __m128i v_filter_w); static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w, const __m128i v_b_w, const __m128i v_filter_w) { - (void) v_filter_w; + (void)v_filter_w; return _mm_avg_epu16(v_a_w, v_b_w); } @@ -1406,27 +1341,27 @@ static INLINE __m128i highbd_apply_filter(const __m128i v_a_w, __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w); __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w); __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w); - __m128i v_res_lo_d = _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), - FILTER_BITS); - __m128i v_res_hi_d = _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), - FILTER_BITS); + __m128i v_res_lo_d = + _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); + __m128i v_res_hi_d = + _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), FILTER_BITS); return _mm_packs_epi32(v_res_lo_d, v_res_hi_d); } // Apply the filter to the contents of the lower half of a and 
b static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w, const __m128i v_b_lo_w, const __m128i v_filter_w, - __m128i* v_res_d) { + __m128i *v_res_d) { const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1)); __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w); __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w); - *v_res_d = _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), - FILTER_BITS); + *v_res_d = + _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); } static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w, - const __m128i v_m_b, __m128i* v_sum_d, - __m128i* v_sse_q) { + const __m128i v_m_b, __m128i *v_sum_d, + __m128i *v_sse_q) { const __m128i v_zero = _mm_setzero_si128(); const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); @@ -1461,11 +1396,8 @@ static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w, *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q); } -static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d, - __m128i v_sse_q, - uint32_t* sse, - const int w, - const int h) { +static INLINE uint32_t highbd_10_calc_masked_variance( + __m128i v_sum_d, __m128i v_sse_q, uint32_t *sse, const int w, const int h) { int64_t sum64; uint64_t sse64; @@ -1486,13 +1418,10 @@ static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d, // Store the SSE *sse = (uint32_t)sse64; // Compute the variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); } -static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d, - __m128i v_sse_q, - uint32_t* sse, - const int w, - const int h) { +static INLINE uint32_t highbd_12_calc_masked_variance( + __m128i v_sum_d, __m128i v_sse_q, uint32_t *sse, const int w, const int h) { int64_t sum64; uint64_t sse64; @@ -1513,43 +1442,42 @@ static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d, // Store the SSE *sse = (uint32_t)sse64; // Compute the variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); } - // High bit depth functions for width (W) >= 8 unsigned int vpx_highbd_masked_subpel_varWxH_xzero( - const uint16_t *src, int src_stride, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, highbd_filter_fn_t filter_fn, - highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int w, int h, highbd_filter_fn_t filter_fn, + highbd_calc_masked_var_t calc_var) { int i, j; __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_w = _mm_set1_epi32(( - bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); + const __m128i v_filter_w = + _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); for (j = 0; j < w; j += 8) { // Load the first row ready - v_src0_w = _mm_loadu_si128((const __m128i*)(src + j)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); // Process 2 rows at a time for (i = 0; i < h; i += 2) { // Load the next row apply the filter - v_src1_w = _mm_loadu_si128((const __m128i*)(src + j + src_stride)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); v_res_w = filter_fn(v_src0_w, 
v_src1_w, v_filter_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Load the next row apply the filter - v_src0_w = _mm_loadu_si128((const __m128i*)(src + j + src_stride * 2)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j + dst_stride)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j + msk_stride)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j + msk_stride)); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next block of rows src += src_stride * 2; @@ -1564,28 +1492,28 @@ unsigned int vpx_highbd_masked_subpel_varWxH_xzero( return calc_var(v_sum_d, v_sse_q, sse, w, h); } unsigned int vpx_highbd_masked_subpel_varWxH_yzero( - const uint16_t *src, int src_stride, int xoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, highbd_filter_fn_t filter_fn, - highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int w, int h, highbd_filter_fn_t filter_fn, + highbd_calc_masked_var_t calc_var) { int i, j; __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_w = _mm_set1_epi32(( - bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); + const __m128i v_filter_w = + _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i++) { for (j = 0; j < w; j += 8) { // Load this row & apply the filter to them - v_src0_w = _mm_loadu_si128((const __m128i*)(src + j)); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + j + 1)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); } src += src_stride; @@ -1596,49 +1524,49 @@ unsigned int vpx_highbd_masked_subpel_varWxH_yzero( } unsigned int vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( - const uint16_t *src, int src_stride, int xoffset, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn, - highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int xoffset, int yoffset, + const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn, + highbd_filter_fn_t yfilter_fn, 
highbd_calc_masked_var_t calc_var) { int i, j; __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w; __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filterx_w = _mm_set1_epi32(( - bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); - const __m128i v_filtery_w = _mm_set1_epi32(( - bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); + const __m128i v_filterx_w = + _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + const __m128i v_filtery_w = + _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); assert(yoffset < BIL_SUBPEL_SHIFTS); for (j = 0; j < w; j += 8) { // Load the first row ready - v_src0_w = _mm_loadu_si128((const __m128i*)(src + j)); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + j + 1)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); // Process 2 rows at a time for (i = 0; i < h; i += 2) { // Load the next row & apply the filter - v_src2_w = _mm_loadu_si128((const __m128i*)(src + src_stride + j)); - v_src3_w = _mm_loadu_si128((const __m128i*)(src + src_stride + j + 1)); + v_src2_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); + v_src3_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); // Complete the calculation for this row and add it to the running total v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Load the next row & apply the filter - v_src0_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + j)); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + - j + 1)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); + v_src1_w = + _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + dst_stride + j)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + msk_stride + j)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + msk_stride + j)); // Complete the calculation for this row and add it to the running total v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); @@ -1657,47 +1585,46 @@ unsigned int vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( // Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 unsigned int vpx_highbd_masked_subpel_var4xH_xzero( - const uint16_t *src, int src_stride, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, + int dst_stride, const 
uint8_t *msk, int msk_stride, unsigned int *sse, + int h, highbd_calc_masked_var_t calc_var) { int i; __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w; __m128i v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_w = _mm_set1_epi32(( - bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first row of src data ready - v_src0_w = _mm_loadl_epi64((const __m128i*)src); + v_src0_w = _mm_loadl_epi64((const __m128i *)src); for (i = 0; i < h; i += 2) { if (yoffset == HALF_PIXEL_OFFSET) { // Load the rest of the source data for these rows v_src1_w = _mm_or_si128( - _mm_slli_si128(v_src0_w, 8), - _mm_loadl_epi64((const __m128i*)(src + src_stride * 1))); + _mm_slli_si128(v_src0_w, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); v_src0_w = _mm_or_si128( - _mm_slli_si128(v_src1_w, 8), - _mm_loadl_epi64((const __m128i*)(src + src_stride * 2))); + _mm_slli_si128(v_src1_w, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); // Apply the y filter v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w); } else { // Load the data and apply the y filter - v_src1_w = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d); - v_src0_w = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); + v_src0_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d); v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d); } // Load the dst data v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1)), - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); // Load the mask data v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1)), - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); // Compute the sum and SSE highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1709,23 +1636,22 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xzero( } unsigned int vpx_highbd_masked_subpel_var4xH_yzero( - const uint16_t *src, int src_stride, int xoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int h, highbd_calc_masked_var_t calc_var) { int i; __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d; __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_w = _mm_set1_epi32(( - bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); + __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i += 2) 
{ // Load the src data - v_src0_w = _mm_loadu_si128((const __m128i*)(src)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src)); v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1741,12 +1667,12 @@ unsigned int vpx_highbd_masked_subpel_var4xH_yzero( } // Load the dst data v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); // Load the mask data v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); // Compute the sum and SSE highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1758,27 +1684,25 @@ unsigned int vpx_highbd_masked_subpel_var4xH_yzero( } unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( - const uint16_t *src, int src_stride, int xoffset, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int xoffset, int yoffset, + const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { int i; __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b; __m128i v_src0_shift_w, v_src1_shift_w; __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filterx_w = _mm_set1_epi32(( - bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); - __m128i v_filtery_w = _mm_set1_epi32(( - bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filterx_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first block of src data - v_src0_w = _mm_loadu_si128((const __m128i*)(src)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src)); v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1794,9 +1718,9 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( } for (i = 0; i < h; i += 4) { // Load the next block of src data - v_src0_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 2)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 3)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); // Apply the x filter if (xoffset == 
HALF_PIXEL_OFFSET) { @@ -1820,19 +1744,19 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( } // Load the dst data v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); // Load the mask data v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); // Compute the sum and SSE highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Load the next block of src data - v_src0_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 4)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 5)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1856,12 +1780,12 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( } // Load the dst data v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); // Load the mask data v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); // Compute the sum and SSE highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1874,145 +1798,115 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( // For W >=8 #define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H) \ -unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse, \ - highbd_calc_masked_var_t calc_var, \ - highbd_variance_fn_t full_variance_function) { \ - uint16_t* src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t* dst = CONVERT_TO_SHORTPTR(dst8); \ - assert(W % 8 == 0); \ - if (xoffset == 0) { \ - if (yoffset == 0) \ - return full_variance_function(src8, src_stride, dst8, dst_stride, \ - msk, msk_stride, sse); \ - else if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride, \ - HALF_PIXEL_OFFSET, \ - dst, dst_stride, \ - msk, msk_stride, \ - sse, W, H, \ - highbd_apply_filter_avg, \ - calc_var); \ - else \ - return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride, \ - yoffset, \ - dst, dst_stride, \ - msk, msk_stride, \ - sse, W, H, \ - highbd_apply_filter, \ - calc_var); \ - } else if (yoffset == 0) { \ - if (xoffset == HALF_PIXEL_OFFSET) \ - return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride, \ - HALF_PIXEL_OFFSET, \ - dst, dst_stride, \ - msk, msk_stride, \ - sse, W, H, \ - highbd_apply_filter_avg, \ - calc_var); \ - else \ - return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride, 
\ - xoffset, \ - dst, dst_stride, \ - msk, msk_stride, \ - sse, W, H, \ - highbd_apply_filter, \ - calc_var); \ - } else if (xoffset == HALF_PIXEL_OFFSET) { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, \ - dst, dst_stride, msk, msk_stride, sse, W, H, \ - highbd_apply_filter_avg, highbd_apply_filter_avg, calc_var); \ - else \ - return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, \ - msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \ - highbd_apply_filter, calc_var); \ - } else { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, xoffset, HALF_PIXEL_OFFSET, \ - dst, dst_stride, msk, msk_stride, sse, W, H, \ - highbd_apply_filter, highbd_apply_filter_avg, calc_var); \ - else \ - return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, \ - dst, dst_stride, msk, msk_stride, sse, W, H, \ - highbd_apply_filter, highbd_apply_filter, calc_var); \ - } \ -} + unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse, highbd_calc_masked_var_t calc_var, \ + highbd_variance_fn_t full_variance_function) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + assert(W % 8 == 0); \ + if (xoffset == 0) { \ + if (yoffset == 0) \ + return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ + msk_stride, sse); \ + else if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_highbd_masked_subpel_varWxH_xzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ + else \ + return vpx_highbd_masked_subpel_varWxH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, \ + W, H, highbd_apply_filter, calc_var); \ + } else if (yoffset == 0) { \ + if (xoffset == HALF_PIXEL_OFFSET) \ + return vpx_highbd_masked_subpel_varWxH_yzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ + else \ + return vpx_highbd_masked_subpel_varWxH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, \ + W, H, highbd_apply_filter, calc_var); \ + } else if (xoffset == HALF_PIXEL_OFFSET) { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ + dst_stride, msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \ + highbd_apply_filter_avg, calc_var); \ + else \ + return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, \ + highbd_apply_filter, calc_var); \ + } else { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter, \ + highbd_apply_filter_avg, calc_var); \ + else \ + return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter, 
highbd_apply_filter, \ + calc_var); \ + } \ + } // For W < 8 #define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H) \ -unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse, \ - highbd_calc_masked_var_t calc_var, \ - highbd_variance_fn_t full_variance_function) { \ - uint16_t* src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t* dst = CONVERT_TO_SHORTPTR(dst8); \ - assert(W == 4); \ - if (xoffset == 0 && yoffset == 0) \ - return full_variance_function(src8, src_stride, dst8, dst_stride, \ - msk, msk_stride, sse); \ - else if (xoffset == 0) \ - return vpx_highbd_masked_subpel_var4xH_xzero(src, src_stride, yoffset, \ - dst, dst_stride, \ - msk, msk_stride, sse, H, \ - calc_var); \ - else if (yoffset == 0) \ - return vpx_highbd_masked_subpel_var4xH_yzero(src, src_stride, xoffset, \ - dst, dst_stride, \ - msk, msk_stride, sse, H, \ - calc_var); \ - else \ - return vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, \ - msk, msk_stride, sse, H, calc_var); \ -} + unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse, highbd_calc_masked_var_t calc_var, \ + highbd_variance_fn_t full_variance_function) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + assert(W == 4); \ + if (xoffset == 0 && yoffset == 0) \ + return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ + msk_stride, sse); \ + else if (xoffset == 0) \ + return vpx_highbd_masked_subpel_var4xH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H, \ + calc_var); \ + else if (yoffset == 0) \ + return vpx_highbd_masked_subpel_var4xH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H, \ + calc_var); \ + else \ + return vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ + sse, H, calc_var); \ + } #define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H) \ -unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(src8, src_stride, \ - xoffset, yoffset, dst8, dst_stride, msk, msk_stride, sse, \ - calc_masked_variance, \ - vpx_highbd_masked_variance##W##x##H##_ssse3); \ -} \ -unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(src8, src_stride, \ - xoffset, yoffset, dst8, dst_stride, msk, msk_stride, sse, \ - highbd_10_calc_masked_variance, \ - vpx_highbd_10_masked_variance##W##x##H##_ssse3); \ -} \ -unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - return 
highbd_masked_sub_pixel_variance##W##x##H##_ssse3(src8, src_stride, \ - xoffset, yoffset, dst8, dst_stride, msk, msk_stride, sse, \ - highbd_12_calc_masked_variance, \ - vpx_highbd_12_masked_variance##W##x##H##_ssse3); \ -} \ + unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, calc_masked_variance, \ + vpx_highbd_masked_variance##W##x##H##_ssse3); \ + } \ + unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, highbd_10_calc_masked_variance, \ + vpx_highbd_10_masked_variance##W##x##H##_ssse3); \ + } \ + unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, highbd_12_calc_masked_variance, \ + vpx_highbd_12_masked_variance##W##x##H##_ssse3); \ + } HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4) HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4) diff --git a/vpx_dsp/x86/obmc_sad_sse4.c b/vpx_dsp/x86/obmc_sad_sse4.c index e21bb98c140418e068d7e3cbcd9c6a77b074adae..8a1581c19db17338c60744eca855e70266aeceea 100644 --- a/vpx_dsp/x86/obmc_sad_sse4.c +++ b/vpx_dsp/x86/obmc_sad_sse4.c @@ -22,10 +22,8 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, +static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, const int height) { const int pre_step = pre_stride - 4; int n = 0; @@ -62,8 +60,7 @@ static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, - const int32_t *mask, - const int width, + const int32_t *mask, const int width, const int height) { const int pre_step = pre_stride - width; int n = 0; @@ -109,17 +106,16 @@ static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, return xx_hsum_epi32_si32(v_sad_d); } -#define OBMCSADWXH(w, h) \ -unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *msk) { \ - if (w == 4) { \ - return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ - } else { \ - return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ - } \ -} +#define OBMCSADWXH(w, h) \ + unsigned int vpx_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } #if CONFIG_EXT_PARTITION OBMCSADWXH(128, 128) @@ -187,8 +183,7 @@ 
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, - const int width, - const int height) { + const int width, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - width; int n = 0; @@ -234,17 +229,16 @@ static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, return xx_hsum_epi32_si32(v_sad_d); } -#define HBD_OBMCSADWXH(w, h) \ -unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask) { \ - if (w == 4) { \ - return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ - } else { \ - return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ - } \ -} +#define HBD_OBMCSADWXH(w, h) \ + unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } #if CONFIG_EXT_PARTITION HBD_OBMCSADWXH(128, 128) diff --git a/vpx_dsp/x86/obmc_variance_sse4.c b/vpx_dsp/x86/obmc_variance_sse4.c index b967c10d5916971c0d11a1cb961b007a9612834d..616db27a6fa456d81516d1abad6ab48fe3a72a69 100644 --- a/vpx_dsp/x86/obmc_variance_sse4.c +++ b/vpx_dsp/x86/obmc_variance_sse4.c @@ -23,12 +23,9 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE void obmc_variance_w4(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - unsigned int *const sse, - int *const sum, +static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, const int h) { const int pre_step = pre_stride - 4; int n = 0; @@ -65,14 +62,10 @@ static INLINE void obmc_variance_w4(const uint8_t *pre, *sse = xx_hsum_epi32_si32(v_sse_d); } -static INLINE void obmc_variance_w8n(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - unsigned int *const sse, - int *const sum, - const int w, - const int h) { +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { const int pre_step = pre_stride - w; int n = 0; __m128i v_sum_d = _mm_setzero_si128(); @@ -120,20 +113,18 @@ static INLINE void obmc_variance_w8n(const uint8_t *pre, *sse = xx_hsum_epi32_si32(v_sse_d); } -#define OBMCVARWXH(W, H) \ -unsigned int vpx_obmc_variance##W##x##H##_sse4_1(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - if (W == 4) { \ - obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ - } else { \ - obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ - } \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define OBMCVARWXH(W, H) \ + unsigned int vpx_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } #if CONFIG_EXT_PARTITION 
OBMCVARWXH(128, 128) @@ -159,13 +150,9 @@ OBMCVARWXH(4, 4) //////////////////////////////////////////////////////////////////////////////// #if CONFIG_VP9_HIGHBITDEPTH -static INLINE void hbd_obmc_variance_w4(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - uint64_t *const sse, - int64_t *const sum, - const int h) { +static INLINE void hbd_obmc_variance_w4( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - 4; int n = 0; @@ -202,14 +189,10 @@ static INLINE void hbd_obmc_variance_w4(const uint8_t *pre8, *sse = xx_hsum_epi32_si32(v_sse_d); } -static INLINE void hbd_obmc_variance_w8n(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - uint64_t *const sse, - int64_t *const sum, - const int w, - const int h) { +static INLINE void hbd_obmc_variance_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, + const int h) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - w; int n = 0; @@ -260,8 +243,7 @@ static INLINE void hbd_obmc_variance_w8n(const uint8_t *pre8, static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; @@ -276,8 +258,7 @@ static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; @@ -292,15 +273,14 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; if (w == 128) { do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, - &sse64, &sum64, 128, 32); + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128, + 32); pre8 += 32 * pre_stride; wsrc += 32 * 128; mask += 32 * 128; @@ -308,8 +288,8 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, } while (h > 0); } else if (w == 64 && h >= 128) { do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, - &sse64, &sum64, 64, 64); + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64, + 64); pre8 += 64 * pre_stride; wsrc += 64 * 64; mask += 64 * 64; @@ -324,39 +304,30 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); } -#define HBD_OBMCVARWXH(W, H) \ -unsigned int vpx_highbd_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ - \ -unsigned int vpx_highbd_10_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, \ - int pre_stride, \ - const int32_t 
*wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ - \ -unsigned int vpx_highbd_12_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define HBD_OBMCVARWXH(W, H) \ + unsigned int vpx_highbd_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_10_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_12_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } #if CONFIG_EXT_PARTITION HBD_OBMCVARWXH(128, 128) diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 8aa4568d674c95c982f8adb1e90870278ea5501c..2c7e431c745a74bb914e8b6d1d33260f940cd76f 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -17,8 +17,9 @@ static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { #if CONFIG_VP9_HIGHBITDEPTH return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], (int16_t)coeff_ptr[4], - (int16_t)coeff_ptr[5], (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); #else return _mm_load_si128((const __m128i *)coeff_ptr); #endif @@ -32,21 +33,20 @@ static INLINE void store_coefficients(__m128i coeff_vals, __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i*)(coeff_ptr + 4), coeff_vals_2); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); #else - _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); #endif } -void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t* zbin_ptr, - const int16_t* round_ptr, const int16_t* quant_ptr, - const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr, - tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr, - uint16_t* eob_ptr, - const int16_t* scan_ptr, - const int16_t* iscan_ptr) { +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, 
tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { __m128i zero; (void)scan_ptr; @@ -66,13 +66,13 @@ void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs, // Setup global values { __m128i pw_1; - zbin = _mm_load_si128((const __m128i*)zbin_ptr); - round = _mm_load_si128((const __m128i*)round_ptr); - quant = _mm_load_si128((const __m128i*)quant_ptr); + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); pw_1 = _mm_set1_epi16(1); zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i*)dequant_ptr); - shift = _mm_load_si128((const __m128i*)quant_shift_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); } { @@ -138,8 +138,8 @@ void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -211,8 +211,8 @@ void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 793658f9ea937098aa17394e0de65d0ce7485ce9..962b8fb11a423dacbb23b130285deb915027f332 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -11,10 +11,8 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_sad32x32x4d_avx2(const uint8_t *src, - int src_stride, - const uint8_t *const ref[4], - int ref_stride, +void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; @@ -30,7 +28,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src, sum_ref1 = _mm256_set1_epi16(0); sum_ref2 = _mm256_set1_epi16(0); sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 32 ; i++) { + for (i = 0; i < 32; i++) { // load src and all refs src_reg = _mm256_loadu_si256((const __m256i *)src); ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); @@ -48,11 +46,11 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src, sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - src+= src_stride; - ref0+= ref_stride; - ref1+= ref_stride; - ref2+= ref_stride; - ref3+= ref_stride; + src += src_stride; + ref0 += ref_stride; + 
ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; } { __m128i sum; @@ -81,10 +79,8 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src, } } -void vpx_sad64x64x4d_avx2(const uint8_t *src, - int src_stride, - const uint8_t *const ref[4], - int ref_stride, +void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; @@ -102,7 +98,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, sum_ref1 = _mm256_set1_epi16(0); sum_ref2 = _mm256_set1_epi16(0); sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 64 ; i++) { + for (i = 0; i < 64; i++) { // load 64 bytes from src and all refs src_reg = _mm256_loadu_si256((const __m256i *)src); srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); @@ -133,11 +129,11 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); - src+= src_stride; - ref0+= ref_stride; - ref1+= ref_stride; - ref2+= ref_stride; - ref3+= ref_stride; + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; } { __m128i sum; diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index ce9ad8f780c244d2e2f52fe7b90dcf396f9836ec..d94413430549279b49193dc2237044b2fd1ea093 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -11,75 +11,74 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#define FSAD64_H(h) \ -unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0 ; i < h ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref_stride; \ - src_ptr+= src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} +#define FSAD64_H(h) \ + unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + } \ + sum_sad_h = 
_mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ + } -#define FSAD32_H(h) \ -unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0 ; i < max ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref2_stride; \ - src_ptr+= src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} +#define FSAD32_H(h) \ + unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ + } -#define FSAD64 \ -FSAD64_H(64); \ -FSAD64_H(32); +#define FSAD64 \ + FSAD64_H(64); \ + FSAD64_H(32); -#define FSAD32 \ -FSAD32_H(64); \ -FSAD32_H(32); \ -FSAD32_H(16); +#define FSAD32 \ + FSAD32_H(64); \ + FSAD32_H(32); \ + FSAD32_H(16); FSAD64; FSAD32; @@ -89,88 +88,86 @@ FSAD32; #undef FSAD64_H #undef FSAD32_H -#define FSADAVG64_H(h) \ -unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ - const uint8_t *second_pred) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0 ; i < h ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - ref1_reg = 
_mm256_avg_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)second_pred)); \ - ref2_reg = _mm256_avg_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref_stride; \ - src_ptr+= src_stride; \ - second_pred+= 64; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} +#define FSADAVG64_H(h) \ + unsigned int vpx_sad64x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ + } -#define FSADAVG32_H(h) \ -unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ - const uint8_t *second_pred) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0 ; i < max ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - ref1_reg = _mm256_avg_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)second_pred)); \ - ref2_reg = _mm256_avg_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = _mm256_add_epi32(sum_sad, \ - _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref2_stride; \ - src_ptr+= src2_stride; \ - second_pred+= 64; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = 
_mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} +#define FSADAVG32_H(h) \ + unsigned int vpx_sad32x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ + } -#define FSADAVG64 \ -FSADAVG64_H(64); \ -FSADAVG64_H(32); +#define FSADAVG64 \ + FSADAVG64_H(64); \ + FSADAVG64_H(32); -#define FSADAVG32 \ -FSADAVG32_H(64); \ -FSADAVG32_H(32); \ -FSADAVG32_H(16); +#define FSADAVG32 \ + FSADAVG32_H(64); \ + FSADAVG32_H(32); \ + FSADAVG32_H(16); FSADAVG64; FSADAVG32; diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c index 5ecd87e7387ee492c0421bebdae1ca2b76e44fae..3d24716f13b28797ba7da93fb2d87e1a0a6a3653 100644 --- a/vpx_dsp/x86/sum_squares_sse2.c +++ b/vpx_dsp/x86/sum_squares_sse2.c @@ -18,10 +18,14 @@ static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { - const __m128i v_val_0_w = _mm_loadl_epi64((const __m128i*)(src+0*stride)); - const __m128i v_val_1_w = _mm_loadl_epi64((const __m128i*)(src+1*stride)); - const __m128i v_val_2_w = _mm_loadl_epi64((const __m128i*)(src+2*stride)); - const __m128i v_val_3_w = _mm_loadl_epi64((const __m128i*)(src+3*stride)); + const __m128i v_val_0_w = + _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + const __m128i v_val_1_w = + _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); + const __m128i v_val_2_w = + _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + const __m128i v_val_3_w = + _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); @@ -32,8 +36,8 @@ static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, - _mm_srli_epi64(v_sum_0123_d, 32)); + const __m128i v_sum_d = + _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); return (uint64_t)_mm_cvtsi128_si32(v_sum_d); } @@ -44,9 +48,8 @@ static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t 
*src, // maintenance instructions in the common case of 4x4. __attribute__((noinline)) #endif -static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, - int stride, - int size) { +static uint64_t +vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) { int r, c; const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); @@ -56,16 +59,24 @@ static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, __m128i v_acc_d = _mm_setzero_si128(); for (c = 0; c < size; c += 8) { - const int16_t *b = src+c; - - const __m128i v_val_0_w = _mm_load_si128((const __m128i*)(b+0*stride)); - const __m128i v_val_1_w = _mm_load_si128((const __m128i*)(b+1*stride)); - const __m128i v_val_2_w = _mm_load_si128((const __m128i*)(b+2*stride)); - const __m128i v_val_3_w = _mm_load_si128((const __m128i*)(b+3*stride)); - const __m128i v_val_4_w = _mm_load_si128((const __m128i*)(b+4*stride)); - const __m128i v_val_5_w = _mm_load_si128((const __m128i*)(b+5*stride)); - const __m128i v_val_6_w = _mm_load_si128((const __m128i*)(b+6*stride)); - const __m128i v_val_7_w = _mm_load_si128((const __m128i*)(b+7*stride)); + const int16_t *b = src + c; + + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); @@ -91,7 +102,7 @@ static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); - src += 8*stride; + src += 8 * stride; } v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); @@ -101,21 +112,20 @@ static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, #else { uint64_t tmp; - _mm_storel_epi64((__m128i*)&tmp, v_acc_q); + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); return tmp; } #endif } -uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, - int size) { +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { // 4 elements per row only requires half an XMM register, so this // must be a special case, but also note that over 75% of all calls // are with size == 4, so it is also the common case. 
if (LIKELY(size == 4)) { return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); } else { - // Generic case + // Generic case return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); } } @@ -176,7 +186,7 @@ static uint64_t vpx_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { #else { uint64_t tmp; - _mm_storel_epi64((__m128i*)&tmp, v_acc0_q); + _mm_storel_epi64((__m128i *)&tmp, v_acc0_q); return tmp; } #endif @@ -186,7 +196,7 @@ uint64_t vpx_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { if (n % 64 == 0) { return vpx_sum_squares_i16_64n_sse2(src, n); } else if (n > 64) { - int k = n & ~(64-1); + int k = n & ~(64 - 1); return vpx_sum_squares_i16_64n_sse2(src, k) + vpx_sum_squares_i16_c(src + k, n - k); } else { diff --git a/vpx_dsp/x86/synonyms.h b/vpx_dsp/x86/synonyms.h index 6708dd1101b0eb4e1170a5497497af53096b2e2e..fb4b9428b00716bcebfe708ea6f135adfb149d10 100644 --- a/vpx_dsp/x86/synonyms.h +++ b/vpx_dsp/x86/synonyms.h @@ -26,35 +26,35 @@ // Loads and stores to do away with the tedium of casting the address // to the right type. static INLINE __m128i xx_loadl_32(const void *a) { - return _mm_cvtsi32_si128(*(const uint32_t*)a); + return _mm_cvtsi32_si128(*(const uint32_t *)a); } static INLINE __m128i xx_loadl_64(const void *a) { - return _mm_loadl_epi64((const __m128i*)a); + return _mm_loadl_epi64((const __m128i *)a); } static INLINE __m128i xx_load_128(const void *a) { - return _mm_load_si128((const __m128i*)a); + return _mm_load_si128((const __m128i *)a); } static INLINE __m128i xx_loadu_128(const void *a) { - return _mm_loadu_si128((const __m128i*)a); + return _mm_loadu_si128((const __m128i *)a); } static INLINE void xx_storel_32(void *const a, const __m128i v) { - *(uint32_t*)a = _mm_cvtsi128_si32(v); + *(uint32_t *)a = _mm_cvtsi128_si32(v); } static INLINE void xx_storel_64(void *const a, const __m128i v) { - _mm_storel_epi64((__m128i*)a, v); + _mm_storel_epi64((__m128i *)a, v); } static INLINE void xx_store_128(void *const a, const __m128i v) { - _mm_store_si128((__m128i*)a, v); + _mm_store_si128((__m128i *)a, v); } static INLINE void xx_storeu_128(void *const a, const __m128i v) { - _mm_storeu_si128((__m128i*)a, v); + _mm_storeu_si128((__m128i *)a, v); } static INLINE __m128i xx_round_epu16(__m128i v_val_w) { @@ -62,7 +62,7 @@ static INLINE __m128i xx_round_epu16(__m128i v_val_w) { } static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { - const __m128i v_s_w =_mm_srli_epi16(v_val_w, bits-1); + const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1); return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); } @@ -75,8 +75,8 @@ static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1)); const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); - const __m128i v_tmp_d = _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), - v_sign_d); + const __m128i v_tmp_d = + _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); return _mm_srai_epi32(v_tmp_d, bits); } @@ -94,14 +94,14 @@ static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { #else { int64_t tmp; - _mm_storel_epi64((__m128i*)&tmp, v_q); + _mm_storel_epi64((__m128i *)&tmp, v_q); return tmp; } #endif } static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { - const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); const __m128i v_1_q = 
_mm_unpackhi_epi32(v_d, v_sign_d); return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); diff --git a/vpx_dsp/x86/txfm_common_sse2.h b/vpx_dsp/x86/txfm_common_sse2.h index f886d30de57b63223ba505b24aa7f3384895c027..e148f5c8bf95051414bb270b042875d390994cea 100644 --- a/vpx_dsp/x86/txfm_common_sse2.h +++ b/vpx_dsp/x86/txfm_common_sse2.h @@ -14,15 +14,15 @@ #include <emmintrin.h> #include "vpx/vpx_integer.h" -#define pair_set_epi16(a, b) \ +#define pair_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) -#define dual_set_epi16(a, b) \ +#define dual_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) -#define octa_set_epi16(a, b, c, d, e, f, g, h) \ +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index f8c97117d487ee46e654a9e7ac5f3cb0caa80d74..7bc2693cfbbfd0f41915fa4953610e128a83655c 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -14,13 +14,13 @@ typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, unsigned int *sse, int *sum); void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); + const uint8_t *ref, int ref_stride, unsigned int *sse, + int *sum); static void variance_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, - get_var_avx2 var_fn, int block_size) { + const uint8_t *ref, int ref_stride, int w, int h, + unsigned int *sse, int *sum, get_var_avx2 var_fn, + int block_size) { int i, j; *sse = 0; @@ -30,21 +30,20 @@ static void variance_avx2(const uint8_t *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(&src[src_stride * i + j], src_stride, - &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); + var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j], + ref_stride, &sse0, &sum0); *sse += sse0; *sum += sum0; } } } - unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vpx_get16x16var_avx2, 16); + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_get16x16var_avx2, 16); return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); } @@ -60,8 +59,8 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vpx_get32x32var_avx2, 32); + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, + vpx_get32x32var_avx2, 32); return *sse - (((int64_t)sum * sum) >> 9); } @@ -69,8 +68,8 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vpx_get32x32var_avx2, 32); + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, + vpx_get32x32var_avx2, 32); return *sse - (((int64_t)sum * sum) >> 10); } @@ -78,8 +77,8 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, 
int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vpx_get32x32var_avx2, 32); + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, + vpx_get32x32var_avx2, 32); return *sse - (((int64_t)sum * sum) >> 12); } @@ -87,79 +86,58 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vpx_get32x32var_avx2, 32); + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, + vpx_get32x32var_avx2, 32); return *sse - (((int64_t)sum * sum) >> 11); } unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, - int height, - unsigned int *sse); - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - const uint8_t *sec, - int sec_stride, - int height, - unsigned int *sseptr); + int height, unsigned int *sse); + +unsigned int vpx_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sseptr); unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, + int src_stride, int x_offset, + int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { unsigned int sse1; - const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - 64, &sse1); + const int se1 = vpx_sub_pixel_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); unsigned int sse2; - const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, - x_offset, y_offset, - dst + 32, dst_stride, - 64, &sse2); + const int se2 = + vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, + dst + 32, dst_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (((int64_t)se * se) >> 12); } unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, + int src_stride, int x_offset, + int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { - const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - 32, sse); + const int se = vpx_sub_pixel_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); return *sse - (((int64_t)se * se) >> 10); } -unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - unsigned int *sse, - const uint8_t *sec) { +unsigned int vpx_sub_pixel_avg_variance64x64_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { unsigned int sse1; - const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - sec, 64, 64, &sse1); + const int se1 = vpx_sub_pixel_avg_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); unsigned int sse2; - const int se2 = - 
vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset, - y_offset, dst + 32, dst_stride, - sec + 32, 64, 64, &sse2); + const int se2 = vpx_sub_pixel_avg_variance32xh_avx2( + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, + 64, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; @@ -167,17 +145,11 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, return *sse - (((int64_t)se * se) >> 12); } -unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - unsigned int *sse, - const uint8_t *sec) { +unsigned int vpx_sub_pixel_avg_variance32x32_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { // Process 32 elements in parallel. - const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - sec, 32, 32, sse); + const int se = vpx_sub_pixel_avg_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); return *sse - (((int64_t)se * se) >> 10); } diff --git a/vpx_dsp/x86/variance_impl_avx2.c b/vpx_dsp/x86/variance_impl_avx2.c index b289e9a0c74840d652aa74c38cf0ca0c0d2cf05a..f26eda3e5c05d0c63d7bd5921ebcb435383f9c24 100644 --- a/vpx_dsp/x86/variance_impl_avx2.c +++ b/vpx_dsp/x86/variance_impl_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +/* clang-format off */ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, @@ -31,289 +32,275 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, }; +/* clang-format on */ + +void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *SSE, int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i, src_2strides, ref_2strides; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing two strides in a 256 bit register reducing the number + // of loop stride by half (comparing to the sse2 code) + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; i++) { + src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); + src = _mm256_inserti128_si256( + src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); + + ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); + ref = _mm256_inserti128_si256( + ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add 
high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = + _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + { + __m128i sum_res, madd_res; + __m128i expand_sum_low, expand_sum_high, expand_sum; + __m128i expand_madd_low, expand_madd_high, expand_madd; + __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; -void vpx_get16x16var_avx2(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i, src_2strides, ref_2strides; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing two strides in a 256 bit register reducing the number - // of loop stride by half (comparing to the sse2 code) - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - src = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i const *) (src_ptr))); - src = _mm256_inserti128_si256(src, - _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1); - - ref =_mm256_castsi128_si256( - _mm_loadu_si128((__m128i const *) (ref_ptr))); - ref = _mm256_inserti128_si256(ref, - _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = _mm256_add_epi32(madd_ref_src, - _mm256_add_epi32(madd_low, madd_high)); - - src_ptr+= src_2strides; - ref_ptr+= ref_2strides; - } - - { - __m128i sum_res, madd_res; - __m128i expand_sum_low, expand_sum_high, expand_sum; - __m128i expand_madd_low, expand_madd_high, expand_madd; - __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // extract the low lane and add it to the high lane - sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), - _mm256_extractf128_si256(sum_ref_src, 1)); + // extract the low lane and add it to the high lane + sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), + _mm256_extractf128_si256(sum_ref_src, 1)); - madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), - _mm256_extractf128_si256(madd_ref_src, 1)); + madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), + _mm256_extractf128_si256(madd_ref_src, 1)); - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), - sum_res); - expand_sum_high = 
_mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), - sum_res); + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = + _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); + expand_sum_high = + _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - // shifting the sign 16 bits right - expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); + // shifting the sign 16 bits right + expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); - expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); + expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm_unpacklo_epi32(madd_res, - _mm256_castsi256_si128(zero_reg)); - expand_madd_high = _mm_unpackhi_epi32(madd_res, - _mm256_castsi256_si128(zero_reg)); + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = + _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); + expand_madd_high = + _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); + expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); - ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum, - _mm256_castsi256_si128(zero_reg)); - ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum, - _mm256_castsi256_si128(zero_reg)); + ex_expand_sum_low = + _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); + ex_expand_sum_high = + _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - // shift 8 bytes eight - madd_res = _mm_srli_si128(expand_madd, 8); - sum_res = _mm_srli_si128(ex_expand_sum, 8); + // shift 8 bytes eight + madd_res = _mm_srli_si128(expand_madd, 8); + sum_res = _mm_srli_si128(ex_expand_sum, 8); - madd_res = _mm_add_epi32(madd_res, expand_madd); - sum_res = _mm_add_epi32(sum_res, ex_expand_sum); + madd_res = _mm_add_epi32(madd_res, expand_madd); + sum_res = _mm_add_epi32(sum_res, ex_expand_sum); - *((int*)SSE)= _mm_cvtsi128_si32(madd_res); + *((int *)SSE) = _mm_cvtsi128_si32(madd_res); - *((int*)Sum)= _mm_cvtsi128_si32(sum_res); - } + *((int *)Sum) = _mm_cvtsi128_si32(sum_res); + } } -void vpx_get32x32var_avx2(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); +void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *SSE, int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); - // processing 32 elements in parallel - for (i = 0; i < 16; i++) { - src = _mm256_loadu_si256((__m256i const *) (src_ptr)); + // processing 32 elements in parallel + for (i = 0; i < 16; i++) { + src = _mm256_loadu_si256((__m256i const *)(src_ptr)); - 
ref = _mm256_loadu_si256((__m256i const *) (ref_ptr)); + ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - // add high to low - madd_ref_src = _mm256_add_epi32(madd_ref_src, - _mm256_add_epi32(madd_low, madd_high)); + // add high to low + madd_ref_src = + _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - src_ptr+= source_stride; - ref_ptr+= recon_stride; - } + src_ptr += source_stride; + ref_ptr += recon_stride; + } - { - __m256i expand_sum_low, expand_sum_high, expand_sum; - __m256i expand_madd_low, expand_madd_high, expand_madd; - __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + { + __m256i expand_sum_low, expand_sum_high, expand_sum; + __m256i expand_madd_low, expand_madd_high, expand_madd; + __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); - expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); + expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); - // shifting the sign 16 bits right - expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); + // shifting the sign 16 bits right + expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); - expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); + expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); - expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); + expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); - expand_madd = 
_mm256_add_epi32(expand_madd_low, expand_madd_high); + expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); - ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); - ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); + ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); + ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); - ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - // shift 8 bytes eight - madd_ref_src = _mm256_srli_si256(expand_madd, 8); - sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); + // shift 8 bytes eight + madd_ref_src = _mm256_srli_si256(expand_madd, 8); + sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); - madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); - sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); + madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); + sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); - // extract the low lane and the high lane and add the results - *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); + // extract the low lane and the high lane and add the results + *((int *)SSE) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); - *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); - } + *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); + } } -#define FILTER_SRC(filter) \ - /* filter the source */ \ +#define FILTER_SRC(filter) \ + /* filter the source */ \ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ - \ - /* add 8 to source */ \ - exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ - exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ - \ - /* divide source by 16 */ \ - exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); -#define MERGE_WITH_SRC(src_reg, reg) \ +#define MERGE_WITH_SRC(src_reg, reg) \ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); -#define LOAD_SRC_DST \ - /* load source and destination */ \ - src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ - dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); +#define LOAD_SRC_DST \ + /* load source and destination */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); -#define AVG_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *) \ - (src + size_stride)); \ - /* average between current and next stride source */ \ +#define AVG_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + /* average between current and next stride source */ \ src_reg = _mm256_avg_epu8(src_reg, src_next_reg); -#define MERGE_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *) \ - (src + size_stride)); \ +#define 
MERGE_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ MERGE_WITH_SRC(src_reg, src_next_reg) -#define CALC_SUM_SSE_INSIDE_LOOP \ - /* expand each byte to 2 bytes */ \ - exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ - exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ - /* source - dest */ \ - exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ - exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ - /* caculate sum */ \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* caculate sum */ \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ - /* calculate sse */ \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ + /* calculate sse */ \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); // final calculation to sum and sse -#define CALC_SUM_AND_SSE \ - res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ - sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ - sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ - \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ - \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); - -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - int height, - unsigned int *sse) { +unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, 
+ int height, unsigned int *sse) { __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; __m256i zero_reg; @@ -325,66 +312,66 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, // x_offset = 0 and y_offset = 0 if (x_offset == 0) { if (y_offset == 0) { - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST // expend each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 0 and y_offset = 8 + // x_offset = 0 and y_offset = 8 } else if (y_offset == 8) { __m256i src_next_reg; - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, src_stride) // expend each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 0 and y_offset = bilin interpolation + // x_offset = 0 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg; y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, src_stride) FILTER_SRC(filter) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } } - // x_offset = 8 and y_offset = 0 + // x_offset = 8 and y_offset = 0 } else if (x_offset == 8) { if (y_offset == 0) { __m256i src_next_reg; - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) // expand each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 8 and y_offset = 8 + // x_offset = 8 and y_offset = 8 } else if (y_offset == 8) { __m256i src_next_reg, src_avg; // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { src_avg = src_reg; - src+= src_stride; + src += src_stride; LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) // average between previous average to current average @@ -393,92 +380,92 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, MERGE_WITH_SRC(src_avg, zero_reg) // save current source average CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } - // x_offset = 8 and y_offset = bilin interpolation + // x_offset = 8 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg, src_avg; y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { // save current source average src_avg = src_reg; - src+= 
src_stride; + src += src_stride; LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) MERGE_WITH_SRC(src_avg, src_reg) FILTER_SRC(filter) CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } } - // x_offset = bilin interpolation and y_offset = 0 + // x_offset = bilin interpolation and y_offset = 0 } else { if (y_offset == 0) { __m256i filter, pw8, src_next_reg; x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = 8 + // x_offset = bilin interpolation and y_offset = 8 } else if (y_offset == 8) { __m256i filter, pw8, src_next_reg, src_pack; x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); // average between previous pack to the current src_pack = _mm256_avg_epu8(src_pack, src_reg); MERGE_WITH_SRC(src_pack, zero_reg) CALC_SUM_SSE_INSIDE_LOOP src_pack = src_reg; - dst+= dst_stride; + dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = bilin interpolation + // x_offset = bilin interpolation and y_offset = bilin interpolation } else { __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; x_offset <<= 5; - xfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); y_offset <<= 5; - yfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(xfilter) // convert each 16 bit to 8 bit to each low and high lane source src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; + for (i = 0; i < height; i++) { + src += src_stride; LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(xfilter) @@ -489,7 +476,7 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, FILTER_SRC(yfilter) src_pack = src_reg; CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } } } @@ -497,16 +484,10 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, return sum; } -unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, 
- int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - const uint8_t *sec, - int sec_stride, - int height, - unsigned int *sse) { +unsigned int vpx_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { __m256i sec_reg; __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; @@ -519,190 +500,190 @@ unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, // x_offset = 0 and y_offset = 0 if (x_offset == 0) { if (y_offset == 0) { - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; + sec += sec_stride; // expend each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } } else if (y_offset == 8) { __m256i src_next_reg; - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; + sec += sec_stride; // expend each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 0 and y_offset = bilin interpolation + // x_offset = 0 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg; y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, src_stride) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; + sec += sec_stride; MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } } - // x_offset = 8 and y_offset = 0 + // x_offset = 8 and y_offset = 0 } else if (x_offset == 8) { if (y_offset == 0) { __m256i src_next_reg; - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; + sec += sec_stride; // expand each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 8 and y_offset = 8 + // x_offset = 8 and y_offset = 8 } else if (y_offset == 8) { __m256i src_next_reg, src_avg; // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); 
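/*
 * Illustrative sketch: the *_avg_* variant that begins above differs from the
 * plain variance kernel by one extra step -- after sub-pixel filtering, each
 * row is averaged with the corresponding row of the second predictor `sec`
 * before the sum/SSE accumulation.  _mm256_avg_epu8() rounds up, i.e. per
 * byte:
 */
static unsigned char avg_round_up(unsigned char filtered, unsigned char sec) {
  return (unsigned char)((filtered + sec + 1) >> 1);
}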
AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { // save current source average src_avg = src_reg; - src+= src_stride; + src += src_stride; LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) // average between previous average to current average src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); - sec+= sec_stride; + sec += sec_stride; // expand each byte to 2 bytes MERGE_WITH_SRC(src_avg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } - // x_offset = 8 and y_offset = bilin interpolation + // x_offset = 8 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg, src_avg; y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { // save current source average src_avg = src_reg; - src+= src_stride; + src += src_stride; LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) MERGE_WITH_SRC(src_avg, src_reg) FILTER_SRC(filter) src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); // expand each byte to 2 bytes MERGE_WITH_SRC(src_avg, zero_reg) - sec+= sec_stride; + sec += sec_stride; CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } } - // x_offset = bilin interpolation and y_offset = 0 + // x_offset = bilin interpolation and y_offset = 0 } else { if (y_offset == 0) { __m256i filter, pw8, src_next_reg; x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); MERGE_WITH_SRC(src_reg, zero_reg) - sec+= sec_stride; + sec += sec_stride; CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = 8 + // x_offset = bilin interpolation and y_offset = 8 } else if (y_offset == 8) { __m256i filter, pw8, src_next_reg, src_pack; x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; + src_pack = 
_mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); // average between previous pack to the current src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); - sec+= sec_stride; + sec += sec_stride; MERGE_WITH_SRC(src_pack, zero_reg) src_pack = src_reg; CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = bilin interpolation + // x_offset = bilin interpolation and y_offset = bilin interpolation } else { __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; x_offset <<= 5; - xfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); y_offset <<= 5; - yfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(xfilter) // convert each 16 bit to 8 bit to each low and high lane source src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; + for (i = 0; i < height; i++) { + src += src_stride; LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(xfilter) @@ -712,13 +693,13 @@ unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, // filter the source FILTER_SRC(yfilter) src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); MERGE_WITH_SRC(src_pack, zero_reg) src_pack = src_reg; - sec+= sec_stride; + sec += sec_stride; CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } } } diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index e76c1a287285bb058611dc93cd32b9d57e01941b..e40eed7fea6639127d1c7027533294d6051d142f 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -15,9 +15,9 @@ #include "vpx_ports/mem.h" -typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); +typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { __m128i vsum = _mm_setzero_si128(); @@ -31,11 +31,12 @@ unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - return _mm_cvtsi128_si32(vsum); + return _mm_cvtsi128_si32(vsum); } -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ +#define READ64(p, stride, i) \ + _mm_unpacklo_epi8( \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) 
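/*
 * Illustrative sketch: a plain-C reference for the get4x4var_sse2() kernel
 * that the READ64() macro above feeds -- accumulate the signed sum and the
 * sum of squared differences over a 4x4 block.  The SIMD version interleaves
 * two rows per load, but the quantities computed are the same.
 */
static void get4x4var_c(const unsigned char *src, int src_stride,
                        const unsigned char *ref, int ref_stride,
                        unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      *sum += d;
      *sse += (unsigned int)(d * d);
    }
  }
}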
static void get4x4var_sse2(const uint8_t *src, int src_stride, @@ -57,32 +58,31 @@ static void get4x4var_sse2(const uint8_t *src, int src_stride, *sum = (int16_t)_mm_extract_epi16(vsum, 0); // sse - vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), - _mm_madd_epi16(diff1, diff1)); + vsum = + _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); *sse = _mm_cvtsi128_si32(vsum); } -void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { +void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, unsigned int *sse, int *sum) { const __m128i zero = _mm_setzero_si128(); __m128i vsum = _mm_setzero_si128(); __m128i vsse = _mm_setzero_si128(); int i; for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + i * ref_stride)), zero); + const __m128i src0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero); const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i src1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero); const __m128i diff1 = _mm_sub_epi16(src1, ref1); vsum = _mm_add_epi16(vsum, diff0); @@ -104,8 +104,8 @@ void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, } void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { + const uint8_t *ref, int ref_stride, unsigned int *sse, + int *sum) { const __m128i zero = _mm_setzero_si128(); __m128i vsum = _mm_setzero_si128(); __m128i vsse = _mm_setzero_si128(); @@ -135,8 +135,8 @@ void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, // sum vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0) + - (int16_t)_mm_extract_epi16(vsum, 1); + *sum = + (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); // sse vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); @@ -144,10 +144,9 @@ void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, *sse = _mm_cvtsi128_si32(vsse); } - static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, + const unsigned char *ref, int ref_stride, int w, + int h, unsigned int *sse, int *sum, getNxMvar_fn_t var_fn, int block_size) { int i, j; @@ -158,8 +157,8 @@ static void variance_sse2(const unsigned char *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); 
*sse += sse0; *sum += sum0; } @@ -178,8 +177,8 @@ unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, - sse, &sum, get4x4var_sse2, 4); + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, + get4x4var_sse2, 4); return *sse - ((sum * sum) >> 5); } @@ -187,8 +186,8 @@ unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, - sse, &sum, get4x4var_sse2, 4); + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, + get4x4var_sse2, 4); return *sse - ((sum * sum) >> 5); } @@ -204,8 +203,8 @@ unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, - sse, &sum, vpx_get8x8var_sse2, 8); + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, + vpx_get8x8var_sse2, 8); return *sse - ((sum * sum) >> 7); } @@ -213,8 +212,8 @@ unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, - sse, &sum, vpx_get8x8var_sse2, 8); + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, + vpx_get8x8var_sse2, 8); return *sse - ((sum * sum) >> 7); } @@ -230,8 +229,8 @@ unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 10); } @@ -239,8 +238,8 @@ unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 9); } @@ -248,8 +247,8 @@ unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 9); } @@ -257,8 +256,8 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 12); } @@ -266,8 +265,8 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 11); } 
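/*
 * The wrappers above all follow the same pattern: accumulate SSE and sum over
 * the block, then return variance = SSE - sum^2 / (w*h), with the division
 * written as a shift by log2(w*h) (e.g. 64x32 -> >> 11).  The (int64_t) cast
 * keeps sum*sum from overflowing 32 bits on the larger blocks.  Minimal
 * sketch of that final step:
 */
static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int wlog2, int hlog2) {
  return sse - (unsigned int)(((long long)sum * sum) >> (wlog2 + hlog2));
}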
@@ -275,8 +274,8 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 11); } @@ -310,17 +309,14 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, // The 2 unused parameters are place holders for PIC enabled build. // These definitions are for functions defined in subpel_variance.asm -#define DECL(w, opt) \ - int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint8_t *dst, \ - ptrdiff_t dst_stride, \ - int height, unsigned int *sse, \ - void *unused0, void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ + void *unused0, void *unused) #define DECLS(opt1, opt2) \ - DECL(4, opt1); \ - DECL(8, opt1); \ + DECL(4, opt1); \ + DECL(8, opt1); \ DECL(16, opt1) DECLS(sse2, sse2); @@ -328,59 +324,52 @@ DECLS(ssse3, ssse3); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ -unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ - int src_stride, \ - int x_offset, \ - int y_offset, \ - const uint8_t *dst, \ - int dst_stride, \ - unsigned int *sse_ptr) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ - x_offset, y_offset, \ - dst + 16, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \ -} +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + unsigned int sse; \ + int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + h, &sse, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (cast_prod(cast se * se) >> (wlog2 + 
hlog2)); \ + } -#define FNS(opt1, opt2) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ -FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ -FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ -FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ -FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ -FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ -FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) FNS(sse2, sse2); FNS(ssse3, ssse3); @@ -389,84 +378,69 @@ FNS(ssse3, ssse3); #undef FN // The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ -int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint8_t *dst, \ - ptrdiff_t dst_stride, \ - const uint8_t *sec, \ - ptrdiff_t sec_stride, \ - int height, unsigned int *sse, \ - void *unused0, void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused) #define DECLS(opt1, opt2) \ -DECL(4, opt1); \ -DECL(8, opt1); \ -DECL(16, opt1) + DECL(4, opt1); \ + DECL(8, opt1); \ + DECL(16, opt1) DECLS(sse2, sse2); DECLS(ssse3, ssse3); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ -unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ - int src_stride, \ - int x_offset, \ - int y_offset, \ - const uint8_t *dst, \ - int dst_stride, \ - unsigned int *sseptr, \ - const uint8_t *sec) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - sec, w, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ - x_offset, y_offset, \ - dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sseptr = sse; 
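/*
 * Illustrative sketch of what one FN(64, 64, 16, 6, 6, ...) expansion above
 * does: the 16-wide kernel is run on the columns at x = 0, 16, 32 and 48, the
 * partial sums and SSEs are added, and the usual variance formula is applied.
 * The function-pointer type below is a simplified stand-in for the real
 * vpx_sub_pixel_variance16xh_<opt> signature (the unused PIC parameters are
 * dropped).
 */
typedef int (*subpel_var16xh_fn)(const unsigned char *src, int src_stride,
                                 int x_offset, int y_offset,
                                 const unsigned char *dst, int dst_stride,
                                 int h, unsigned int *sse);

static unsigned int subpel_variance_64x64_sketch(
    subpel_var16xh_fn kernel, const unsigned char *src, int src_stride,
    int x_offset, int y_offset, const unsigned char *dst, int dst_stride,
    unsigned int *sse_ptr) {
  unsigned int sse = 0, sse_col;
  int se = 0, x;
  for (x = 0; x < 64; x += 16) {
    se += kernel(src + x, src_stride, x_offset, y_offset, dst + x, dst_stride,
                 64, &sse_col);
    sse += sse_col;
  }
  *sse_ptr = sse;
  return sse - (unsigned int)(((long long)se * se) >> 12); /* 6 + 6 */
}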
\ - return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \ -} +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sseptr, \ + const uint8_t *sec) { \ + unsigned int sse; \ + int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sseptr = sse; \ + return sse - (cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } -#define FNS(opt1, opt2) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ -FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ -FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ -FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ -FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ -FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ -FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) FNS(sse2, sse); FNS(ssse3, ssse3); @@ -474,216 +448,215 @@ FNS(ssse3, ssse3); #undef FNS #undef FN -void vpx_upsampled_pred_sse2(uint8_t *comp_pred, - int width, int height, - const uint8_t *ref, int ref_stride) { - int i, j; - int stride = ref_stride << 3; - - if (width >= 16) { - // read 16 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 16) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); - __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); - __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); - __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); - __m128i 
t0, t1, t2, t3; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - t2 = _mm_unpacklo_epi8(s4, s5); - s5 = _mm_unpackhi_epi8(s4, s5); - t3 = _mm_unpacklo_epi8(s6, s7); - s7 = _mm_unpackhi_epi8(s6, s7); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s4 = _mm_unpacklo_epi8(t2, s5); - s6 = _mm_unpacklo_epi8(t3, s7); - s0 = _mm_unpacklo_epi32(s0, s2); - s4 = _mm_unpacklo_epi32(s4, s6); - s0 = _mm_unpacklo_epi64(s0, s4); - - _mm_storeu_si128((__m128i *)(comp_pred), s0); - comp_pred += 16; - ref += 16 * 8; - } - ref += stride - (width << 3); +void vpx_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height, + const uint8_t *ref, int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + if (width >= 16) { + // read 16 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 16) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); + __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); + __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); + __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + t2 = _mm_unpacklo_epi8(s4, s5); + s5 = _mm_unpackhi_epi8(s4, s5); + t3 = _mm_unpacklo_epi8(s6, s7); + s7 = _mm_unpackhi_epi8(s6, s7); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s4 = _mm_unpacklo_epi8(t2, s5); + s6 = _mm_unpacklo_epi8(t3, s7); + s0 = _mm_unpacklo_epi32(s0, s2); + s4 = _mm_unpacklo_epi32(s4, s6); + s0 = _mm_unpacklo_epi64(s0, s4); + + _mm_storeu_si128((__m128i *)(comp_pred), s0); + comp_pred += 16; + ref += 16 * 8; } - } else if (width >= 8) { - // read 8 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i t0, t1; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s0 = _mm_unpacklo_epi32(s0, s2); - - _mm_storel_epi64((__m128i *)(comp_pred), s0); - comp_pred += 8; - ref += 8 * 8; - } - ref += stride - (width << 3); + ref += stride - (width << 3); + } + } else if (width >= 8) { + // read 8 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s0 = _mm_unpacklo_epi32(s0, s2); + + _mm_storel_epi64((__m128i *)(comp_pred), s0); + comp_pred += 8; + ref += 8 * 8; } - } else { - // 
read 4 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 4) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i t0; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - s0 = _mm_unpacklo_epi8(t0, s1); - - *(int *)comp_pred = _mm_cvtsi128_si32(s0); - comp_pred += 4; - ref += 4 * 8; - } - ref += stride - (width << 3); + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i t0; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + s0 = _mm_unpacklo_epi8(t0, s1); + + *(int *)comp_pred = _mm_cvtsi128_si32(s0); + comp_pred += 4; + ref += 4 * 8; } + ref += stride - (width << 3); } + } } void vpx_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, - const uint8_t *ref, int ref_stride) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - int i, j; - int stride = ref_stride << 3; - - if (width >= 16) { - // read 16 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 16) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); - __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); - __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); - __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); - __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - __m128i p1; - __m128i t0, t1, t2, t3; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - t2 = _mm_unpacklo_epi8(s4, s5); - s5 = _mm_unpackhi_epi8(s4, s5); - t3 = _mm_unpacklo_epi8(s6, s7); - s7 = _mm_unpackhi_epi8(s6, s7); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s4 = _mm_unpacklo_epi8(t2, s5); - s6 = _mm_unpacklo_epi8(t3, s7); - - s0 = _mm_unpacklo_epi32(s0, s2); - s4 = _mm_unpacklo_epi32(s4, s6); - s0 = _mm_unpacklo_epi8(s0, zero); - s4 = _mm_unpacklo_epi8(s4, zero); - - p1 = _mm_unpackhi_epi8(p0, zero); - p0 = _mm_unpacklo_epi8(p0, zero); - p0 = _mm_adds_epu16(s0, p0); - p1 = _mm_adds_epu16(s4, p1); - p0 = _mm_adds_epu16(p0, one); - p1 = _mm_adds_epu16(p1, one); - - p0 = _mm_srli_epi16(p0, 1); - p1 = _mm_srli_epi16(p1, 1); - p0 = _mm_packus_epi16(p0, p1); - - _mm_storeu_si128((__m128i *)(comp_pred), p0); - comp_pred += 16; - pred += 16; - ref += 16 * 8; - } - ref += stride - (width << 3); + int width, int height, const uint8_t *ref, + int ref_stride) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + int i, j; + int stride = ref_stride << 3; + + if (width >= 16) { + // read 16 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 16) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); + __m128i s5 = _mm_loadu_si128((const 
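/*
 * Illustrative sketch: scalar equivalent of the vpx_upsampled_pred_sse2()
 * body above.  The reference buffer is upsampled 8x horizontally (its row
 * stride is ref_stride * 8), so forming the prediction amounts to taking
 * every 8th byte of each reference row; the SSE2 code does this 16, 8 or 4
 * output pixels at a time with byte shuffles.
 */
static void upsampled_pred_c(unsigned char *comp_pred, int width, int height,
                             const unsigned char *ref, int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) comp_pred[j] = ref[j * 8];
    comp_pred += width;
    ref += ref_stride * 8;
  }
}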
__m128i *)(ref + 80)); + __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); + __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + __m128i p1; + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + t2 = _mm_unpacklo_epi8(s4, s5); + s5 = _mm_unpackhi_epi8(s4, s5); + t3 = _mm_unpacklo_epi8(s6, s7); + s7 = _mm_unpackhi_epi8(s6, s7); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s4 = _mm_unpacklo_epi8(t2, s5); + s6 = _mm_unpacklo_epi8(t3, s7); + + s0 = _mm_unpacklo_epi32(s0, s2); + s4 = _mm_unpacklo_epi32(s4, s6); + s0 = _mm_unpacklo_epi8(s0, zero); + s4 = _mm_unpacklo_epi8(s4, zero); + + p1 = _mm_unpackhi_epi8(p0, zero); + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p1 = _mm_adds_epu16(s4, p1); + p0 = _mm_adds_epu16(p0, one); + p1 = _mm_adds_epu16(p1, one); + + p0 = _mm_srli_epi16(p0, 1); + p1 = _mm_srli_epi16(p1, 1); + p0 = _mm_packus_epi16(p0, p1); + + _mm_storeu_si128((__m128i *)(comp_pred), p0); + comp_pred += 16; + pred += 16; + ref += 16 * 8; } - } else if (width >= 8) { - // read 8 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i p0 = _mm_loadl_epi64((const __m128i *)pred); - __m128i t0, t1; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s0 = _mm_unpacklo_epi32(s0, s2); - s0 = _mm_unpacklo_epi8(s0, zero); - - p0 = _mm_unpacklo_epi8(p0, zero); - p0 = _mm_adds_epu16(s0, p0); - p0 = _mm_adds_epu16(p0, one); - p0 = _mm_srli_epi16(p0, 1); - p0 = _mm_packus_epi16(p0, zero); - - _mm_storel_epi64((__m128i *)(comp_pred), p0); - comp_pred += 8; - pred += 8; - ref += 8 * 8; - } - ref += stride - (width << 3); + ref += stride - (width << 3); + } + } else if (width >= 8) { + // read 8 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i p0 = _mm_loadl_epi64((const __m128i *)pred); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s0 = _mm_unpacklo_epi32(s0, s2); + s0 = _mm_unpacklo_epi8(s0, zero); + + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + p0 = _mm_packus_epi16(p0, zero); + + _mm_storel_epi64((__m128i *)(comp_pred), p0); + comp_pred += 8; + pred += 8; + ref += 8 * 8; } - } else { - // read 4 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 4) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred); - __m128i t0; - - t0 = _mm_unpacklo_epi8(s0, 
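/*
 * Illustrative sketch: scalar equivalent of vpx_comp_avg_upsampled_pred_sse2()
 * -- the every-8th-byte reference sample is averaged, rounding up (matching
 * the adds / +1 / >>1 sequence in the intrinsics), with the existing
 * predictor.
 */
static void comp_avg_upsampled_pred_c(unsigned char *comp_pred,
                                      const unsigned char *pred, int width,
                                      int height, const unsigned char *ref,
                                      int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j)
      comp_pred[j] = (unsigned char)((ref[j * 8] + pred[j] + 1) >> 1);
    comp_pred += width;
    pred += width;
    ref += ref_stride * 8;
  }
}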
s1); - s1 = _mm_unpackhi_epi8(s0, s1); - s0 = _mm_unpacklo_epi8(t0, s1); - s0 = _mm_unpacklo_epi8(s0, zero); - - p0 = _mm_unpacklo_epi8(p0, zero); - p0 = _mm_adds_epu16(s0, p0); - p0 = _mm_adds_epu16(p0, one); - p0 = _mm_srli_epi16(p0, 1); - p0 = _mm_packus_epi16(p0, zero); - - *(int *)comp_pred = _mm_cvtsi128_si32(p0); - comp_pred += 4; - pred += 4; - ref += 4 * 8; - } - ref += stride - (width << 3); + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred); + __m128i t0; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + s0 = _mm_unpacklo_epi8(t0, s1); + s0 = _mm_unpacklo_epi8(s0, zero); + + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + p0 = _mm_packus_epi16(p0, zero); + + *(int *)comp_pred = _mm_cvtsi128_si32(p0); + comp_pred += 4; + pred += 4; + ref += 4 * 8; } + ref += stride - (width << 3); } + } } diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c index 422b0fc422d68e0a22d81aec174a4ae5750169de..727d9d1156ead8b04aff6cfa0850763299eb61a4 100644 --- a/vpx_dsp/x86/vpx_asm_stubs.c +++ b/vpx_dsp/x86/vpx_asm_stubs.c @@ -75,7 +75,7 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); // const int16_t *filter_y, int y_step_q4, // int w, int h); FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_ , sse2); +FUN_CONV_2D(avg_, sse2); #if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; @@ -157,6 +157,6 @@ HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, // const int16_t *filter_y, int y_step_q4, // int w, int h, int bd); HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_ , sse2); +HIGH_FUN_CONV_2D(avg_, sse2); #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 #endif // HAVE_SSE2 diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 01cf4354a0938aee25c27bd192099729d16f123b..6d53b8705ddf9573efee76d392e8402614b50ab8 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -36,35 +36,32 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { }; #if defined(__clang__) -# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (defined(__APPLE__) && defined(__apple_build_version__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) - -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# else // clang > 3.3, and not 5.0 on macosx. -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // clang <= 3.3 +#if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) + +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *) & (x)) +#else // clang > 3.3, and not 5.0 on macosx. 
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 #elif defined(__GNUC__) -# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -# else // gcc > 4.7 -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // gcc <= 4.6 -#else // !(gcc || clang) -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *) & (x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ -static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { +static void vpx_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; @@ -78,26 +75,22 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
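/*
 * Illustrative sketch: scalar reference for the 8-tap horizontal convolution
 * that vpx_filter_block1d16_h8_avx2() implements.  The taps are applied to
 * src[x-3..x+4] (hence the src_ptr - 3 loads), the result is rounded with +64
 * and shifted right by 7 (the addFilterReg64 / srai-by-7 pair), and finally
 * saturated to 8 bits (the packus step).
 */
static unsigned char filter8_h_c(const unsigned char *src,
                                 const short *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k - 3] * filter[k];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}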
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); @@ -107,17 +100,18 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i-=2) { + for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line-3)), 1); + srcReg32b1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); + srcReg32b1 = _mm256_inserti128_si256( + srcReg32b1, + _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), + 1); // filter the source buffer - srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); @@ -127,28 +121,29 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b1_1 = _mm256_adds_epi16( + srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = 
_mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line+5)), 1); + srcReg32b2 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); + srcReg32b2 = _mm256_inserti128_si256( + srcReg32b2, + _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), + 1); // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b1_1 = _mm256_adds_epi16( + srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); @@ -162,19 +157,18 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - + srcRegFilt32b2_1 = _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); @@ -187,19 +181,18 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, - srcRegFilt32b2_1); + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); - src_ptr+=src_stride; + src_ptr += src_stride; // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); // save the next 16 bits - _mm_store_si128((__m128i*)(output_ptr+output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); - output_ptr+=dst_stride; + _mm_store_si128((__m128i *)(output_ptr + output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += dst_stride; } // if the number of strides is odd. 
@@ -211,83 +204,74 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt4Reg)); + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt3Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); + srcRegFilt2_1 = + _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt3Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, 
_mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); @@ -299,16 +283,13 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); } } -static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { +static void vpx_filter_block1d16_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; @@ -323,60 +304,56 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. 
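/*
 * Illustrative sketch: scalar reference for the vertical 8-tap convolution
 * that vpx_filter_block1d16_v8_avx2() implements -- the same taps and the
 * same +64 / >>7 rounding and 8-bit saturation as the horizontal kernel,
 * applied down a column.  `src` points at the first of the 8 contributing
 * rows (the caller has already backed the pointer up by 3 rows).
 */
static unsigned char filter8_v_c(const unsigned char *src, int pitch,
                                 const short *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k * pitch] * filter[k];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}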
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr))); + srcReg32b1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); + _mm256_castsi256_si128(srcReg32b2), 1); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); + _mm256_castsi256_si128(srcReg32b3), 1); srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); + _mm256_castsi256_si128(srcReg32b4), 1); srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); + _mm256_castsi256_si128(srcReg32b5), 1); srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); + _mm256_castsi256_si128(srcReg32b6), 1); srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); + _mm256_castsi256_si128(srcReg32b7), 1); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); @@ -394,89 +371,87 @@ static void 
vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - for (i = output_height; i > 1; i-=2) { - // load the last 2 loads of 16 bytes and have every two - // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // 
multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); - - src_ptr+=src_stride; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcReg32b1)); - - // save the next 16 bits - _mm_store_si128((__m128i*)(output_ptr+out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); - - output_ptr+=dst_stride; - - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr += src_stride; + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i *)(output_ptr + out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; } if (i > 0) { __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; @@ -485,55 +460,53 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together - srcRegFilt4 = _mm_unpacklo_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - srcRegFilt7 = _mm_unpackhi_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = 
_mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, - _mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, - _mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = + _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); - // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(secondFilters)); srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(secondFilters)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); + _mm256_castsi256_si128(thirdFilters)); srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); + _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); @@ -545,7 +518,7 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); } } @@ -575,10 +548,10 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3; #define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 -#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 -#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 -#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 -#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 
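One detail worth noting in the filter hunks above: the two middle partial sums are always folded into the accumulator as the min() result first and the max() result second, using saturating adds. The apparent intent is to let a negative partial offset the accumulator before a positive one pushes it toward the 16-bit saturation limit. A scalar sketch of that ordering, supplied by the editor for illustration only:

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>

static int16_t adds16(int16_t a, int16_t b) {  /* saturating 16-bit add */
  const int32_t s = (int32_t)a + b;
  return (int16_t)(s > INT16_MAX ? INT16_MAX : s < INT16_MIN ? INT16_MIN : s);
}

/* Fold two partial sums into acc: smaller first, then larger. */
static int16_t fold_min_then_max(int16_t acc, int16_t a, int16_t b) {
  const int16_t lo = a < b ? a : b;
  const int16_t hi = a < b ? b : a;
  return adds16(adds16(acc, lo), hi);
}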
+#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 +#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 +#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 +#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 69cd6967a9ddfd62a0be6cc12fae6c334cef7a7c..36af4dd132da85dbf0a73fe938b8603fd5265ec3 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -48,23 +48,20 @@ filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { +void vpx_filter_block1d4_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, srcReg, minReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 =_mm_set1_epi32((int)0x0400040u); + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
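The `_mm_set1_epi32((int)0x0400040u)` a few lines above is simply the value 64 in each 16-bit half of every 32-bit lane, matching its "0,64,0,64,..." comment; adding that register before the arithmetic shift by 7 rounds the Q7 filter sums to nearest, and the later packus step clamps to [0, 255]. The scalar equivalent of that rounding, as an editor's sketch (it ignores the intermediate 16-bit saturation the SIMD path can hit):

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>

static uint8_t round_shift_clip(int32_t sum) {
  const int32_t v = (sum + 64) >> 7;   /* add half of 128, then divide by 128 */
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}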
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits in the filter into the first lane firstFilters = _mm_shufflelo_epi16(filtersReg, 0); @@ -78,23 +75,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters - shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); + shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); - srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); + srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); @@ -110,21 +107,18 @@ void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr+=src_pixels_per_line; + src_ptr += src_pixels_per_line; // save only 4 bytes - *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); - output_ptr+=output_pitch; + output_ptr += output_pitch; } } -void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { +void vpx_filter_block1d8_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; @@ -136,7 +130,7 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
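In the 4-wide kernel above, the two `_mm_shuffle_epi8` calls with the filt1_4_h8/filt2_4_h8 masks do the data gathering: they rearrange the 16 loaded source bytes so that each adjacent byte pair in the shuffled result lines up with one pair of filter taps, which is exactly the layout maddubs expects. A scalar model of the pshufb semantics behind those calls (editor's sketch; the mask contents themselves are not visible in this patch):

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>

/* Byte shuffle as pshufb defines it: a set high bit in the mask writes
 * zero, otherwise the low four bits select a source byte. */
static void pshufb_ref(const uint8_t src[16], const uint8_t mask[16],
                       uint8_t out[16]) {
  for (int i = 0; i < 16; ++i)
    out[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
}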
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits (first and second byte) // across 128 bit register @@ -160,16 +154,16 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); + srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); + srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); @@ -179,7 +173,7 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); @@ -190,21 +184,18 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr+=src_pixels_per_line; + src_ptr += src_pixels_per_line; // save only 8 bytes - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); - output_ptr+=output_pitch; + output_ptr += output_pitch; } } -void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { +void vpx_filter_block1d8_v8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i addFilterReg64, filtersReg, minReg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; @@ -217,7 +208,7 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
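The vertical kernels, like the one whose setup appears above, apply the same eight taps down a column; each iteration the row registers are shifted down by one (the `srcReg1 = srcReg2; ... srcReg7 = srcReg8;` sequence further below), so only the newest row has to be reloaded. A plain-C reference for the computation itself, written by the editor as a sketch and assuming `src` already points three rows above the output position, which is what the `src - src_stride * 3` argument in the FUN_CONV_1D lines below arranges:

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>
#include <stddef.h>

static void vert8_ref(const uint8_t *src, ptrdiff_t src_pitch, uint8_t *dst,
                      ptrdiff_t dst_pitch, int width, int height,
                      const int16_t *taps) {  /* 8 taps in Q7 */
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int32_t sum = 64;                       /* rounding */
      for (int k = 0; k < 8; ++k)
        sum += taps[k] * src[(y + k) * src_pitch + x];
      sum >>= 7;
      dst[y * dst_pitch + x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
    }
  }
}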
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits in the filter firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); @@ -269,7 +260,7 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr+=src_pitch; + src_ptr += src_pitch; // shift down a row srcReg1 = srcReg2; @@ -281,9 +272,9 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, srcReg7 = srcReg8; // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); - output_ptr+=out_pitch; + output_ptr += out_pitch; } } @@ -339,32 +330,33 @@ FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, ssse3); -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ -} +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ + const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ + const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ + const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ + \ + const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ + const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ + const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ + const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ + \ + out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ + out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ + out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ + out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ + out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ + out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ + out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ + out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ + } static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, uint8_t *dst, const int16_t *x_filter) { @@ -420,7 +412,7 @@ static void filter_horiz_w8_ssse3(const uint8_t 
*src_x, ptrdiff_t src_pitch, // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); + _mm_storel_epi64((__m128i *)dst, temp); } static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, @@ -436,23 +428,22 @@ static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, - A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i*)dst, A); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H); + TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); + + _mm_storel_epi64((__m128i *)dst, A); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); } static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; @@ -523,7 +514,7 @@ static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); // 02 03 12 13 22 23 32 33 @@ -565,16 +556,16 @@ static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, C = _mm_srli_si128(A, 8); D = _mm_srli_si128(A, 12); - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); + *(int *)(dst) = _mm_cvtsi128_si32(A); + *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); + *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); + *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); } static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; @@ -648,8 +639,8 @@ static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int 
y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int y; int y_q4 = y0_q4; @@ -705,13 +696,13 @@ static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); + _mm_storel_epi64((__m128i *)dst, temp); } static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int y; int y_q4 = y0_q4; @@ -794,15 +785,15 @@ static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, // result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); src_ptr += 16; - // save 16 bytes convolve result - _mm_store_si128((__m128i*)&dst[i], temp_hi); + // save 16 bytes convolve result + _mm_store_si128((__m128i *)&dst[i], temp_hi); } } static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int y; int y_q4 = y0_q4; @@ -822,11 +813,9 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h) { + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. 
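The note that closes the hunk above summarizes the structure: a horizontal pass fills a fixed-stride temp buffer (with extra rows so the vertical taps have context), and a vertical pass then writes the final pixels; the calls further below pick the 4-, 8- or 16-wide variant of each pass based on w. A compact editor's sketch of that flow, using hypothetical pass callbacks and an assumed temp stride rather than the MAX_SB_SIZE sizing the real code uses:

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>

enum { TMP_STRIDE = 128, TAPS = 8 };   /* assumed sizes, for illustration */

typedef void (*pass_fn)(const uint8_t *src, int src_stride, uint8_t *dst,
                        int dst_stride, int w, int h);

/* Two-pass 2D filtering: horizontal into temp (with TAPS - 1 extra rows
 * so the vertical taps have context), then vertical into dst. Both pass
 * callbacks are assumed to take a pointer to the topmost row they read. */
static void convolve2d_sketch(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
                              pass_fn horiz, pass_fn vert) {
  static uint8_t temp[TMP_STRIDE * (TMP_STRIDE + TAPS)];  /* w,h <= TMP_STRIDE */
  const int top = TAPS / 2 - 1;
  horiz(src - top * src_stride, src_stride, temp, TMP_STRIDE, w, h + TAPS - 1);
  vert(temp, TMP_STRIDE, dst, dst_stride, w, h);
}

The `temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1)` offsets in the real calls serve the same purpose as the `top` adjustment here: lining the vertical pass up with the first output row.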
@@ -851,38 +840,26 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, if (w >= 8) { scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, - temp, - MAX_SB_SIZE, - x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, - temp, - MAX_SB_SIZE, - x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height); } if (w >= 16) { scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, - dst, - dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); } else if (w == 8) { scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, - dst, - dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); } else { scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, - dst, - dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); } } @@ -896,10 +873,9 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) { return (int)((const InterpKernel *)(intptr_t)f - base); } -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); @@ -907,9 +883,8 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); - scaledconvolve2d(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h); } // void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, @@ -923,4 +898,4 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, // const int16_t *filter_y, int y_step_q4, // int w, int h); FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_ , ssse3); +FUN_CONV_2D(avg_, ssse3); diff --git a/vpxdec.c b/vpxdec.c index 67b3b51c7e1bdf1e00c9c3457b5be607870cb5fb..cd6d7ed8a5724a962259e823909afe431d53c403 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -47,117 +47,124 @@ struct VpxDecInputContext { struct WebmInputContext *webm_ctx; }; -static const arg_def_t looparg = ARG_DEF( - NULL, "loops", 1, "Number of times to decode the file"); -static const arg_def_t codecarg = ARG_DEF( - NULL, "codec", 1, "Codec to use"); -static const arg_def_t use_yv12 = ARG_DEF( - NULL, "yv12", 0, "Output raw YV12 frames"); -static const arg_def_t use_i420 = ARG_DEF( - NULL, "i420", 0, "Output raw I420 frames"); -static const arg_def_t flipuvarg = ARG_DEF( - NULL, "flipuv", 0, "Flip the chroma planes in the output"); -static const arg_def_t rawvideo = ARG_DEF( - NULL, "rawvideo", 0, "Output raw YUV frames"); -static const arg_def_t 
noblitarg = ARG_DEF( - NULL, "noblit", 0, "Don't process the decoded frames"); -static const arg_def_t progressarg = ARG_DEF( - NULL, "progress", 0, "Show progress after each frame decodes"); -static const arg_def_t limitarg = ARG_DEF( - NULL, "limit", 1, "Stop decoding after n frames"); -static const arg_def_t skiparg = ARG_DEF( - NULL, "skip", 1, "Skip the first n input frames"); -static const arg_def_t postprocarg = ARG_DEF( - NULL, "postproc", 0, "Postprocess decoded frames"); -static const arg_def_t summaryarg = ARG_DEF( - NULL, "summary", 0, "Show timing summary"); -static const arg_def_t outputfile = ARG_DEF( - "o", "output", 1, "Output file name pattern (see below)"); -static const arg_def_t threadsarg = ARG_DEF( - "t", "threads", 1, "Max threads to use"); -static const arg_def_t frameparallelarg = ARG_DEF( - NULL, "frame-parallel", 0, "Frame parallel decode"); -static const arg_def_t verbosearg = ARG_DEF( - "v", "verbose", 0, "Show version string"); -static const arg_def_t error_concealment = ARG_DEF( - NULL, "error-concealment", 0, "Enable decoder error-concealment"); -static const arg_def_t scalearg = ARG_DEF( - "S", "scale", 0, "Scale output frames uniformly"); -static const arg_def_t continuearg = ARG_DEF( - "k", "keep-going", 0, "(debug) Continue decoding after error"); -static const arg_def_t fb_arg = ARG_DEF( - NULL, "frame-buffers", 1, "Number of frame buffers to use"); -static const arg_def_t md5arg = ARG_DEF( - NULL, "md5", 0, "Compute the MD5 sum of the decoded frame"); +static const arg_def_t looparg = + ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); +static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); +static const arg_def_t use_yv12 = + ARG_DEF(NULL, "yv12", 0, "Output raw YV12 frames"); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Output raw I420 frames"); +static const arg_def_t flipuvarg = + ARG_DEF(NULL, "flipuv", 0, "Flip the chroma planes in the output"); +static const arg_def_t rawvideo = + ARG_DEF(NULL, "rawvideo", 0, "Output raw YUV frames"); +static const arg_def_t noblitarg = + ARG_DEF(NULL, "noblit", 0, "Don't process the decoded frames"); +static const arg_def_t progressarg = + ARG_DEF(NULL, "progress", 0, "Show progress after each frame decodes"); +static const arg_def_t limitarg = + ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames"); +static const arg_def_t skiparg = + ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); +static const arg_def_t postprocarg = + ARG_DEF(NULL, "postproc", 0, "Postprocess decoded frames"); +static const arg_def_t summaryarg = + ARG_DEF(NULL, "summary", 0, "Show timing summary"); +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output file name pattern (see below)"); +static const arg_def_t threadsarg = + ARG_DEF("t", "threads", 1, "Max threads to use"); +static const arg_def_t frameparallelarg = + ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode"); +static const arg_def_t verbosearg = + ARG_DEF("v", "verbose", 0, "Show version string"); +static const arg_def_t error_concealment = + ARG_DEF(NULL, "error-concealment", 0, "Enable decoder error-concealment"); +static const arg_def_t scalearg = + ARG_DEF("S", "scale", 0, "Scale output frames uniformly"); +static const arg_def_t continuearg = + ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error"); +static const arg_def_t fb_arg = + ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use"); +static const arg_def_t md5arg = + ARG_DEF(NULL, "md5", 0, 
"Compute the MD5 sum of the decoded frame"); #if CONFIG_VP9_HIGHBITDEPTH -static const arg_def_t outbitdeptharg = ARG_DEF( - NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); +static const arg_def_t outbitdeptharg = + ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); #endif #if CONFIG_EXT_TILE -static const arg_def_t tiler = ARG_DEF( - NULL, "tile-row", 1, "Row index of tile to decode " - "(-1 for all rows)"); -static const arg_def_t tilec = ARG_DEF( - NULL, "tile-column", 1, "Column index of tile to decode " - "(-1 for all columns)"); +static const arg_def_t tiler = ARG_DEF(NULL, "tile-row", 1, + "Row index of tile to decode " + "(-1 for all rows)"); +static const arg_def_t tilec = ARG_DEF(NULL, "tile-column", 1, + "Column index of tile to decode " + "(-1 for all columns)"); #endif // CONFIG_EXT_TILE -static const arg_def_t *all_args[] = { - &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg, - &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile, - &threadsarg, &frameparallelarg, &verbosearg, &scalearg, &fb_arg, - &md5arg, &error_concealment, &continuearg, +static const arg_def_t *all_args[] = { &codecarg, + &use_yv12, + &use_i420, + &flipuvarg, + &rawvideo, + &noblitarg, + &progressarg, + &limitarg, + &skiparg, + &postprocarg, + &summaryarg, + &outputfile, + &threadsarg, + &frameparallelarg, + &verbosearg, + &scalearg, + &fb_arg, + &md5arg, + &error_concealment, + &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif #if CONFIG_EXT_TILE - &tiler, &tilec, + &tiler, + &tilec, #endif // CONFIG_EXT_TILE - NULL -}; + NULL }; #if CONFIG_LIBYUV static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, - FilterModeEnum mode) { + FilterModeEnum mode) { #if CONFIG_VP9_HIGHBITDEPTH if (src->fmt == VPX_IMG_FMT_I42016) { assert(dst->fmt == VPX_IMG_FMT_I42016); - return I420Scale_16((uint16_t*)src->planes[VPX_PLANE_Y], - src->stride[VPX_PLANE_Y]/2, - (uint16_t*)src->planes[VPX_PLANE_U], - src->stride[VPX_PLANE_U]/2, - (uint16_t*)src->planes[VPX_PLANE_V], - src->stride[VPX_PLANE_V]/2, - src->d_w, src->d_h, - (uint16_t*)dst->planes[VPX_PLANE_Y], - dst->stride[VPX_PLANE_Y]/2, - (uint16_t*)dst->planes[VPX_PLANE_U], - dst->stride[VPX_PLANE_U]/2, - (uint16_t*)dst->planes[VPX_PLANE_V], - dst->stride[VPX_PLANE_V]/2, - dst->d_w, dst->d_h, - mode); + return I420Scale_16( + (uint16_t *)src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y] / 2, + (uint16_t *)src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U] / 2, + (uint16_t *)src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V] / 2, + src->d_w, src->d_h, (uint16_t *)dst->planes[VPX_PLANE_Y], + dst->stride[VPX_PLANE_Y] / 2, (uint16_t *)dst->planes[VPX_PLANE_U], + dst->stride[VPX_PLANE_U] / 2, (uint16_t *)dst->planes[VPX_PLANE_V], + dst->stride[VPX_PLANE_V] / 2, dst->d_w, dst->d_h, mode); } #endif assert(src->fmt == VPX_IMG_FMT_I420); assert(dst->fmt == VPX_IMG_FMT_I420); return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y], src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U], - src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V], - src->d_w, src->d_h, - dst->planes[VPX_PLANE_Y], dst->stride[VPX_PLANE_Y], + src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V], src->d_w, + src->d_h, dst->planes[VPX_PLANE_Y], dst->stride[VPX_PLANE_Y], dst->planes[VPX_PLANE_U], dst->stride[VPX_PLANE_U], - dst->planes[VPX_PLANE_V], dst->stride[VPX_PLANE_V], - dst->d_w, dst->d_h, - mode); + dst->planes[VPX_PLANE_V], dst->stride[VPX_PLANE_V], dst->d_w, + 
dst->d_h, mode); } #endif void usage_exit(void) { int i; - fprintf(stderr, "Usage: %s <options> filename\n\n" - "Options:\n", exec_name); + fprintf(stderr, + "Usage: %s <options> filename\n\n" + "Options:\n", + exec_name); arg_show_usage(stderr, all_args); fprintf(stderr, "\nOutput File Patterns:\n\n" @@ -172,27 +179,25 @@ void usage_exit(void) { "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)" "\n\n Pattern arguments are only supported in conjunction " "with the --yv12 and\n --i420 options. If the -o option is " - "not specified, the output will be\n directed to stdout.\n" - ); + "not specified, the output will be\n directed to stdout.\n"); fprintf(stderr, "\nIncluded decoders:\n\n"); for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - fprintf(stderr, " %-6s - %s\n", - decoder->name, vpx_codec_iface_name(decoder->codec_interface())); + fprintf(stderr, " %-6s - %s\n", decoder->name, + vpx_codec_iface_name(decoder->codec_interface())); } exit(EXIT_FAILURE); } -static int raw_read_frame(FILE *infile, uint8_t **buffer, - size_t *bytes_read, size_t *buffer_size) { +static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { char raw_hdr[RAW_FRAME_HDR_SZ]; size_t frame_size = 0; if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) - warn("Failed to read RAW frame size\n"); + if (!feof(infile)) warn("Failed to read RAW frame size\n"); } else { const size_t kCorruptFrameThreshold = 256 * 1024 * 1024; const size_t kFrameTooSmallThreshold = 256 * 1024; @@ -239,13 +244,12 @@ static int read_frame(struct VpxDecInputContext *input, uint8_t **buf, return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer); #endif case FILE_TYPE_RAW: - return raw_read_frame(input->vpx_input_ctx->file, - buf, bytes_in_buffer, buffer_size); + return raw_read_frame(input->vpx_input_ctx->file, buf, bytes_in_buffer, + buffer_size); case FILE_TYPE_IVF: - return ivf_read_frame(input->vpx_input_ctx->file, - buf, bytes_in_buffer, buffer_size); - default: - return 1; + return ivf_read_frame(input->vpx_input_ctx->file, buf, bytes_in_buffer, + buffer_size); + default: return 1; } } @@ -258,7 +262,7 @@ static void update_image_md5(const vpx_image_t *img, const int planes[3], const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1); const int h = vpx_img_plane_height(img, plane); for (y = 0; y < h; ++y) { @@ -304,8 +308,8 @@ static int file_is_raw(struct VpxInputContext *input) { if (mem_get_le32(buf) < 256 * 1024 * 1024) { for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - if (!vpx_codec_peek_stream_info(decoder->codec_interface(), - buf + 4, 32 - 4, &si)) { + if (!vpx_codec_peek_stream_info(decoder->codec_interface(), buf + 4, + 32 - 4, &si)) { is_raw = 1; input->fourcc = decoder->fourcc; input->width = si.w; @@ -324,13 +328,13 @@ static int file_is_raw(struct VpxInputContext *input) { static void show_progress(int frame_in, int frame_out, uint64_t dx_time) { fprintf(stderr, - "%d decoded frames/%d showed frames in %"PRId64" us (%.2f fps)\r", + "%d decoded frames/%d showed frames in %" PRId64 " us (%.2f fps)\r", frame_in, frame_out, dx_time, (double)frame_out * 1000000.0 / (double)dx_time); } struct ExternalFrameBuffer { - uint8_t* data; + uint8_t *data; size_t size; int in_use; }; @@ -349,23 +353,19 @@ static int get_vp9_frame_buffer(void *cb_priv, size_t min_size, int i; struct ExternalFrameBufferList *const ext_fb_list = (struct ExternalFrameBufferList *)cb_priv; - if (ext_fb_list == NULL) - return -1; + if (ext_fb_list == NULL) return -1; // Find a free frame buffer. for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) { - if (!ext_fb_list->ext_fb[i].in_use) - break; + if (!ext_fb_list->ext_fb[i].in_use) break; } - if (i == ext_fb_list->num_external_frame_buffers) - return -1; + if (i == ext_fb_list->num_external_frame_buffers) return -1; if (ext_fb_list->ext_fb[i].size < min_size) { free(ext_fb_list->ext_fb[i].data); ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t)); - if (!ext_fb_list->ext_fb[i].data) - return -1; + if (!ext_fb_list->ext_fb[i].data) return -1; ext_fb_list->ext_fb[i].size = min_size; } @@ -406,47 +406,22 @@ static void generate_filename(const char *pattern, char *out, size_t q_len, /* parse the pattern */ q[q_len - 1] = '\0'; switch (p[1]) { - case 'w': - snprintf(q, q_len - 1, "%d", d_w); - break; - case 'h': - snprintf(q, q_len - 1, "%d", d_h); - break; - case '1': - snprintf(q, q_len - 1, "%d", frame_in); - break; - case '2': - snprintf(q, q_len - 1, "%02d", frame_in); - break; - case '3': - snprintf(q, q_len - 1, "%03d", frame_in); - break; - case '4': - snprintf(q, q_len - 1, "%04d", frame_in); - break; - case '5': - snprintf(q, q_len - 1, "%05d", frame_in); - break; - case '6': - snprintf(q, q_len - 1, "%06d", frame_in); - break; - case '7': - snprintf(q, q_len - 1, "%07d", frame_in); - break; - case '8': - snprintf(q, q_len - 1, "%08d", frame_in); - break; - case '9': - snprintf(q, q_len - 1, "%09d", frame_in); - break; - default: - die("Unrecognized pattern %%%c\n", p[1]); - break; + case 'w': snprintf(q, q_len - 1, "%d", d_w); break; + case 'h': snprintf(q, q_len - 1, "%d", d_h); break; + case '1': snprintf(q, q_len - 1, "%d", frame_in); break; + case '2': snprintf(q, q_len - 1, "%02d", frame_in); break; + case '3': snprintf(q, q_len - 1, "%03d", frame_in); break; + case '4': snprintf(q, q_len - 1, "%04d", frame_in); break; + case '5': snprintf(q, q_len - 1, "%05d", frame_in); break; + case '6': snprintf(q, q_len - 1, "%06d", frame_in); break; + case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; + case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; + case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; + default: die("Unrecognized pattern %%%c\n", p[1]); break; 
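For readers who have not seen the pattern syntax this switch implements: `%w` and `%h` expand to the frame dimensions, and `%1` through `%9` expand to the frame number zero-padded to that many digits, using exactly the snprintf formats listed above. A throwaway illustration with made-up values (editor's sketch, not code from the tree):

/* --- editor's sketch, not part of the patch --- */
#include <stdio.h>

int main(void) {
  char q[16];
  snprintf(q, sizeof(q), "%03d", 7);    /* what "%3" produces for frame 7   */
  printf("frame-%s.yuv\n", q);          /* -> frame-007.yuv                 */
  snprintf(q, sizeof(q), "%d", 640);    /* what "%w" produces for width 640 */
  printf("%sx480.yuv\n", q);            /* -> 640x480.yuv                   */
  return 0;
}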
} pat_len = strlen(q); - if (pat_len >= q_len - 1) - die("Output filename too long.\n"); + if (pat_len >= q_len - 1) die("Output filename too long.\n"); q += pat_len; p += 2; q_len -= pat_len; @@ -459,8 +434,7 @@ static void generate_filename(const char *pattern, char *out, size_t q_len, else copy_len = next_pat - p; - if (copy_len >= q_len - 1) - die("Output filename too long.\n"); + if (copy_len >= q_len - 1) die("Output filename too long.\n"); memcpy(q, p, copy_len); q[copy_len] = '\0'; @@ -478,8 +452,7 @@ static int is_single_file(const char *outfile_pattern) { p = strchr(p, '%'); if (p && p[1] >= '1' && p[1] <= '9') return 0; // pattern contains sequence number, so it's not unique - if (p) - p++; + if (p) p++; } while (p); return 1; @@ -488,8 +461,7 @@ static int is_single_file(const char *outfile_pattern) { static void print_md5(unsigned char digest[16], const char *filename) { int i; - for (i = 0; i < 16; ++i) - printf("%02x", digest[i]); + for (i = 0; i < 16; ++i) printf("%02x", digest[i]); printf(" %s\n", filename); } @@ -499,8 +471,7 @@ static FILE *open_outfile(const char *name) { return stdout; } else { FILE *file = fopen(name, "wb"); - if (!file) - fatal("Failed to open output file '%s'", name); + if (!file) fatal("Failed to open output file '%s'", name); return file; } } @@ -509,62 +480,61 @@ static FILE *open_outfile(const char *name) { static int img_shifted_realloc_required(const vpx_image_t *img, const vpx_image_t *shifted, vpx_img_fmt_t required_fmt) { - return img->d_w != shifted->d_w || - img->d_h != shifted->d_h || + return img->d_w != shifted->d_w || img->d_h != shifted->d_h || required_fmt != shifted->fmt; } #endif static int main_loop(int argc, const char **argv_) { - vpx_codec_ctx_t decoder; - char *fn = NULL; - int i; - uint8_t *buf = NULL; - size_t bytes_in_buffer = 0, buffer_size = 0; - FILE *infile; - int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; - int do_md5 = 0, progress = 0, frame_parallel = 0; - int stop_after = 0, postproc = 0, summary = 0, quiet = 1; - int arg_skip = 0; - int ec_enabled = 0; - int keep_going = 0; + vpx_codec_ctx_t decoder; + char *fn = NULL; + int i; + uint8_t *buf = NULL; + size_t bytes_in_buffer = 0, buffer_size = 0; + FILE *infile; + int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; + int do_md5 = 0, progress = 0, frame_parallel = 0; + int stop_after = 0, postproc = 0, summary = 0, quiet = 1; + int arg_skip = 0; + int ec_enabled = 0; + int keep_going = 0; const VpxInterface *interface = NULL; const VpxInterface *fourcc_interface = NULL; uint64_t dx_time = 0; - struct arg arg; - char **argv, **argi, **argj; - - int single_file; - int use_y4m = 1; - int opt_yv12 = 0; - int opt_i420 = 0; - vpx_codec_dec_cfg_t cfg = {0, 0, 0}; + struct arg arg; + char **argv, **argi, **argj; + + int single_file; + int use_y4m = 1; + int opt_yv12 = 0; + int opt_i420 = 0; + vpx_codec_dec_cfg_t cfg = { 0, 0, 0 }; #if CONFIG_VP9_HIGHBITDEPTH - unsigned int output_bit_depth = 0; + unsigned int output_bit_depth = 0; #endif #if CONFIG_EXT_TILE - int tile_row = -1; - int tile_col = -1; + int tile_row = -1; + int tile_col = -1; #endif // CONFIG_EXT_TILE - int frames_corrupted = 0; - int dec_flags = 0; - int do_scale = 0; - vpx_image_t *scaled_img = NULL; + int frames_corrupted = 0; + int dec_flags = 0; + int do_scale = 0; + vpx_image_t *scaled_img = NULL; #if CONFIG_VP9_HIGHBITDEPTH - vpx_image_t *img_shifted = NULL; + vpx_image_t *img_shifted = NULL; #endif - int frame_avail, got_data, flush_decoder = 0; - int num_external_frame_buffers = 0; - 
struct ExternalFrameBufferList ext_fb_list = {0, NULL}; + int frame_avail, got_data, flush_decoder = 0; + int num_external_frame_buffers = 0; + struct ExternalFrameBufferList ext_fb_list = { 0, NULL }; const char *outfile_pattern = NULL; - char outfile_name[PATH_MAX] = {0}; + char outfile_name[PATH_MAX] = { 0 }; FILE *outfile = NULL; MD5Context md5_ctx; unsigned char md5_digest[16]; - struct VpxDecInputContext input = {NULL, NULL}; + struct VpxDecInputContext input = { NULL, NULL }; struct VpxInputContext vpx_input_ctx; #if CONFIG_WEBM_IO struct WebmInputContext webm_ctx; @@ -702,7 +672,8 @@ static int main_loop(int argc, const char **argv_) { if (use_y4m && !noblit) { if (!single_file) { - fprintf(stderr, "YUV4MPEG2 not supported with output patterns," + fprintf(stderr, + "YUV4MPEG2 not supported with output patterns," " try --i420 or --yv12 or --rawvideo.\n"); return EXIT_FAILURE; } @@ -710,7 +681,8 @@ static int main_loop(int argc, const char **argv_) { #if CONFIG_WEBM_IO if (vpx_input_ctx.file_type == FILE_TYPE_WEBM) { if (webm_guess_framerate(input.webm_ctx, input.vpx_input_ctx)) { - fprintf(stderr, "Failed to guess framerate -- error parsing " + fprintf(stderr, + "Failed to guess framerate -- error parsing " "webm file?\n"); return EXIT_FAILURE; } @@ -724,21 +696,19 @@ static int main_loop(int argc, const char **argv_) { else interface = fourcc_interface; - if (!interface) - interface = get_vpx_decoder_by_index(0); + if (!interface) interface = get_vpx_decoder_by_index(0); dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) | (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) | (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0); - if (vpx_codec_dec_init(&decoder, interface->codec_interface(), - &cfg, dec_flags)) { + if (vpx_codec_dec_init(&decoder, interface->codec_interface(), &cfg, + dec_flags)) { fprintf(stderr, "Failed to initialize decoder: %s\n", vpx_codec_error(&decoder)); return EXIT_FAILURE; } - if (!quiet) - fprintf(stderr, "%s\n", decoder.name); + if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP10_DECODER && CONFIG_EXT_TILE if (strncmp(decoder.name, "WebM Project VP10", 17) == 0) { @@ -756,11 +726,9 @@ static int main_loop(int argc, const char **argv_) { } #endif - if (arg_skip) - fprintf(stderr, "Skipping first %d frames.\n", arg_skip); + if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); while (arg_skip) { - if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) - break; + if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; arg_skip--; } @@ -768,9 +736,9 @@ static int main_loop(int argc, const char **argv_) { ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); - if (vpx_codec_set_frame_buffer_functions( - &decoder, get_vp9_frame_buffer, release_vp9_frame_buffer, - &ext_fb_list)) { + if (vpx_codec_set_frame_buffer_functions(&decoder, get_vp9_frame_buffer, + release_vp9_frame_buffer, + &ext_fb_list)) { fprintf(stderr, "Failed to configure external frame buffers: %s\n", vpx_codec_error(&decoder)); return EXIT_FAILURE; @@ -782,10 +750,10 @@ static int main_loop(int argc, const char **argv_) { /* Decode file */ while (frame_avail || got_data) { - vpx_codec_iter_t iter = NULL; - vpx_image_t *img; + vpx_codec_iter_t iter = NULL; + vpx_image_t *img; struct vpx_usec_timer timer; - int corrupted = 0; + int corrupted = 0; frame_avail = 0; if (!stop_after || frame_in < stop_after) { @@ 
-795,16 +763,14 @@ static int main_loop(int argc, const char **argv_) { vpx_usec_timer_start(&timer); - if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer, - NULL, 0)) { + if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer, NULL, + 0)) { const char *detail = vpx_codec_error_detail(&decoder); - warn("Failed to decode frame %d: %s", - frame_in, vpx_codec_error(&decoder)); + warn("Failed to decode frame %d: %s", frame_in, + vpx_codec_error(&decoder)); - if (detail) - warn("Additional information: %s", detail); - if (!keep_going) - goto fail; + if (detail) warn("Additional information: %s", detail); + if (!keep_going) goto fail; } vpx_usec_timer_mark(&timer); @@ -837,17 +803,15 @@ static int main_loop(int argc, const char **argv_) { if (!frame_parallel && vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) { warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder)); - if (!keep_going) - goto fail; + if (!keep_going) goto fail; } frames_corrupted += corrupted; - if (progress) - show_progress(frame_in, frame_out, dx_time); + if (progress) show_progress(frame_in, frame_out, dx_time); if (!noblit && img) { - const int PLANES_YUV[] = {VPX_PLANE_Y, VPX_PLANE_U, VPX_PLANE_V}; - const int PLANES_YVU[] = {VPX_PLANE_Y, VPX_PLANE_V, VPX_PLANE_U}; + const int PLANES_YUV[] = { VPX_PLANE_Y, VPX_PLANE_U, VPX_PLANE_V }; + const int PLANES_YVU[] = { VPX_PLANE_Y, VPX_PLANE_V, VPX_PLANE_U }; const int *planes = flipuv ? PLANES_YVU : PLANES_YUV; if (do_scale) { @@ -871,8 +835,8 @@ static int main_loop(int argc, const char **argv_) { render_height = render_size[1]; } } - scaled_img = vpx_img_alloc(NULL, img->fmt, render_width, - render_height, 16); + scaled_img = + vpx_img_alloc(NULL, img->fmt, render_width, render_height, 16); scaled_img->bit_depth = img->bit_depth; } @@ -881,7 +845,8 @@ static int main_loop(int argc, const char **argv_) { libyuv_scale(img, scaled_img, kFilterBox); img = scaled_img; #else - fprintf(stderr, "Failed to scale output frame: %s.\n" + fprintf(stderr, + "Failed to scale output frame: %s.\n" "Scaling is disabled in this configuration. " "To enable scaling, configure with --enable-libyuv\n", vpx_codec_error(&decoder)); @@ -896,22 +861,22 @@ static int main_loop(int argc, const char **argv_) { } // Shift up or down if necessary if (output_bit_depth != 0 && output_bit_depth != img->bit_depth) { - const vpx_img_fmt_t shifted_fmt = output_bit_depth == 8 ? - img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) : - img->fmt | VPX_IMG_FMT_HIGHBITDEPTH; + const vpx_img_fmt_t shifted_fmt = + output_bit_depth == 8 + ? 
img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) + : img->fmt | VPX_IMG_FMT_HIGHBITDEPTH; if (img_shifted && img_shifted_realloc_required(img, img_shifted, shifted_fmt)) { vpx_img_free(img_shifted); img_shifted = NULL; } if (!img_shifted) { - img_shifted = vpx_img_alloc(NULL, shifted_fmt, - img->d_w, img->d_h, 16); + img_shifted = + vpx_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); img_shifted->bit_depth = output_bit_depth; } if (output_bit_depth > img->bit_depth) { - vpx_img_upshift(img_shifted, img, - output_bit_depth - img->bit_depth); + vpx_img_upshift(img_shifted, img, output_bit_depth - img->bit_depth); } else { vpx_img_downshift(img_shifted, img, img->bit_depth - output_bit_depth); @@ -927,7 +892,7 @@ static int main_loop(int argc, const char **argv_) { if (single_file) { if (use_y4m) { - char buf[Y4M_BUFFER_SIZE] = {0}; + char buf[Y4M_BUFFER_SIZE] = { 0 }; size_t len = 0; if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) { fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n"); @@ -935,11 +900,9 @@ static int main_loop(int argc, const char **argv_) { } if (frame_out == 1) { // Y4M file header - len = y4m_write_file_header(buf, sizeof(buf), - vpx_input_ctx.width, - vpx_input_ctx.height, - &vpx_input_ctx.framerate, - img->fmt, img->bit_depth); + len = y4m_write_file_header( + buf, sizeof(buf), vpx_input_ctx.width, vpx_input_ctx.height, + &vpx_input_ctx.framerate, img->fmt, img->bit_depth); if (do_md5) { MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); } else { @@ -967,7 +930,8 @@ static int main_loop(int argc, const char **argv_) { } if (opt_yv12) { if ((img->fmt != VPX_IMG_FMT_I420 && - img->fmt != VPX_IMG_FMT_YV12) || img->bit_depth != 8) { + img->fmt != VPX_IMG_FMT_YV12) || + img->bit_depth != 8) { fprintf(stderr, "Cannot produce yv12 output for bit-stream.\n"); goto fail; } @@ -981,8 +945,8 @@ static int main_loop(int argc, const char **argv_) { write_image_file(img, planes, outfile); } } else { - generate_filename(outfile_pattern, outfile_name, PATH_MAX, - img->d_w, img->d_h, frame_in); + generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, + img->d_h, frame_in); if (do_md5) { MD5Init(&md5_ctx); update_image_md5(img, planes, &md5_ctx); @@ -1027,8 +991,7 @@ fail: webm_free(input.webm_ctx); #endif - if (input.vpx_input_ctx->file_type != FILE_TYPE_WEBM) - free(buf); + if (input.vpx_input_ctx->file_type != FILE_TYPE_WEBM) free(buf); if (scaled_img) vpx_img_free(scaled_img); #if CONFIG_VP9_HIGHBITDEPTH @@ -1063,7 +1026,6 @@ int main(int argc, const char **argv_) { } } free(argv); - for (i = 0; !error && i < loops; i++) - error = main_loop(argc, argv_); + for (i = 0; !error && i < loops; i++) error = main_loop(argc, argv_); return error; } diff --git a/vpxenc.c b/vpxenc.c index 30d9696e13628c0d1edcd2cdc88793d3daf54946..1bc060b4f6c57f37e38aba02c80783e016c0c731 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -51,8 +51,7 @@ #include "./y4minput.h" /* Swallow warnings about unused results of fread/fwrite */ -static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, - FILE *stream) { +static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { return fread(ptr, size, nmemb, stream); } #define fread wrap_fread @@ -63,7 +62,6 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, } #define fwrite wrap_fwrite - static const char *exec_name; static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, @@ -74,11 +72,9 @@ static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, 
vfprintf(stderr, s, ap); fprintf(stderr, ": %s\n", vpx_codec_error(ctx)); - if (detail) - fprintf(stderr, " %s\n", detail); + if (detail) fprintf(stderr, " %s\n", detail); - if (fatal) - exit(EXIT_FAILURE); + if (fatal) exit(EXIT_FAILURE); } } @@ -105,8 +101,7 @@ static int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { int shortread = 0; if (input_ctx->file_type == FILE_TYPE_Y4M) { - if (y4m_input_fetch_frame(y4m, f, img) < 1) - return 0; + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; } else { shortread = read_yuv_frame(input_ctx, img); } @@ -128,252 +123,262 @@ static int fourcc_is_ivf(const char detect[4]) { return 0; } -static const arg_def_t debugmode = ARG_DEF( - "D", "debug", 0, "Debug mode (makes output deterministic)"); -static const arg_def_t outputfile = ARG_DEF( - "o", "output", 1, "Output filename"); -static const arg_def_t use_yv12 = ARG_DEF( - NULL, "yv12", 0, "Input file is YV12 "); -static const arg_def_t use_i420 = ARG_DEF( - NULL, "i420", 0, "Input file is I420 (default)"); -static const arg_def_t use_i422 = ARG_DEF( - NULL, "i422", 0, "Input file is I422"); -static const arg_def_t use_i444 = ARG_DEF( - NULL, "i444", 0, "Input file is I444"); -static const arg_def_t use_i440 = ARG_DEF( - NULL, "i440", 0, "Input file is I440"); -static const arg_def_t codecarg = ARG_DEF( - NULL, "codec", 1, "Codec to use"); -static const arg_def_t passes = ARG_DEF( - "p", "passes", 1, "Number of passes (1/2)"); -static const arg_def_t pass_arg = ARG_DEF( - NULL, "pass", 1, "Pass to execute (1/2)"); -static const arg_def_t fpf_name = ARG_DEF( - NULL, "fpf", 1, "First pass statistics file name"); +static const arg_def_t debugmode = + ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output filename"); +static const arg_def_t use_yv12 = + ARG_DEF(NULL, "yv12", 0, "Input file is YV12 "); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)"); +static const arg_def_t use_i422 = + ARG_DEF(NULL, "i422", 0, "Input file is I422"); +static const arg_def_t use_i444 = + ARG_DEF(NULL, "i444", 0, "Input file is I444"); +static const arg_def_t use_i440 = + ARG_DEF(NULL, "i440", 0, "Input file is I440"); +static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); +static const arg_def_t passes = + ARG_DEF("p", "passes", 1, "Number of passes (1/2)"); +static const arg_def_t pass_arg = + ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); +static const arg_def_t fpf_name = + ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); #if CONFIG_FP_MB_STATS -static const arg_def_t fpmbf_name = ARG_DEF( - NULL, "fpmbf", 1, "First pass block statistics file name"); +static const arg_def_t fpmbf_name = + ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name"); #endif -static const arg_def_t limit = ARG_DEF( - NULL, "limit", 1, "Stop encoding after n input frames"); -static const arg_def_t skip = ARG_DEF( - NULL, "skip", 1, "Skip the first n input frames"); -static const arg_def_t deadline = ARG_DEF( - "d", "deadline", 1, "Deadline per frame (usec)"); -static const arg_def_t best_dl = ARG_DEF( - NULL, "best", 0, "Use Best Quality Deadline"); -static const arg_def_t good_dl = ARG_DEF( - NULL, "good", 0, "Use Good Quality Deadline"); -static const arg_def_t rt_dl = ARG_DEF( - NULL, "rt", 0, "Use Realtime Quality Deadline"); -static const arg_def_t quietarg = ARG_DEF( - "q", "quiet", 0, "Do not print encode progress"); -static const 
arg_def_t verbosearg = ARG_DEF( - "v", "verbose", 0, "Show encoder parameters"); -static const arg_def_t psnrarg = ARG_DEF( - NULL, "psnr", 0, "Show PSNR in status line"); +static const arg_def_t limit = + ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"); +static const arg_def_t skip = + ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); +static const arg_def_t deadline = + ARG_DEF("d", "deadline", 1, "Deadline per frame (usec)"); +static const arg_def_t best_dl = + ARG_DEF(NULL, "best", 0, "Use Best Quality Deadline"); +static const arg_def_t good_dl = + ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"); +static const arg_def_t rt_dl = + ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"); +static const arg_def_t quietarg = + ARG_DEF("q", "quiet", 0, "Do not print encode progress"); +static const arg_def_t verbosearg = + ARG_DEF("v", "verbose", 0, "Show encoder parameters"); +static const arg_def_t psnrarg = + ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line"); static const struct arg_enum_list test_decode_enum[] = { - {"off", TEST_DECODE_OFF}, - {"fatal", TEST_DECODE_FATAL}, - {"warn", TEST_DECODE_WARN}, - {NULL, 0} + { "off", TEST_DECODE_OFF }, + { "fatal", TEST_DECODE_FATAL }, + { "warn", TEST_DECODE_WARN }, + { NULL, 0 } }; static const arg_def_t recontest = ARG_DEF_ENUM( NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum); -static const arg_def_t framerate = ARG_DEF( - NULL, "fps", 1, "Stream frame rate (rate/scale)"); -static const arg_def_t use_webm = ARG_DEF( - NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"); -static const arg_def_t use_ivf = ARG_DEF( - NULL, "ivf", 0, "Output IVF"); -static const arg_def_t out_part = ARG_DEF( - "P", "output-partitions", 0, - "Makes encoder output partitions. Requires IVF output!"); -static const arg_def_t q_hist_n = ARG_DEF( - NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"); -static const arg_def_t rate_hist_n = ARG_DEF( - NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"); -static const arg_def_t disable_warnings = ARG_DEF( - NULL, "disable-warnings", 0, - "Disable warnings about potentially incorrect encode settings."); -static const arg_def_t disable_warning_prompt = ARG_DEF( - "y", "disable-warning-prompt", 0, - "Display warnings, but do not prompt user to continue."); +static const arg_def_t framerate = + ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)"); +static const arg_def_t use_webm = + ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"); +static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF"); +static const arg_def_t out_part = + ARG_DEF("P", "output-partitions", 0, + "Makes encoder output partitions. 
Requires IVF output!"); +static const arg_def_t q_hist_n = + ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"); +static const arg_def_t rate_hist_n = + ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"); +static const arg_def_t disable_warnings = + ARG_DEF(NULL, "disable-warnings", 0, + "Disable warnings about potentially incorrect encode settings."); +static const arg_def_t disable_warning_prompt = + ARG_DEF("y", "disable-warning-prompt", 0, + "Display warnings, but do not prompt user to continue."); #if CONFIG_VP9_HIGHBITDEPTH static const arg_def_t test16bitinternalarg = ARG_DEF( NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer"); #endif -static const arg_def_t *main_args[] = { - &debugmode, - &outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &skip, - &deadline, &best_dl, &good_dl, &rt_dl, - &quietarg, &verbosearg, &psnrarg, &use_webm, &use_ivf, &out_part, &q_hist_n, - &rate_hist_n, &disable_warnings, &disable_warning_prompt, &recontest, - NULL -}; - -static const arg_def_t usage = ARG_DEF( - "u", "usage", 1, "Usage profile number to use"); -static const arg_def_t threads = ARG_DEF( - "t", "threads", 1, "Max number of threads to use"); -static const arg_def_t profile = ARG_DEF( - NULL, "profile", 1, "Bitstream profile number to use"); +static const arg_def_t *main_args[] = { &debugmode, + &outputfile, + &codecarg, + &passes, + &pass_arg, + &fpf_name, + &limit, + &skip, + &deadline, + &best_dl, + &good_dl, + &rt_dl, + &quietarg, + &verbosearg, + &psnrarg, + &use_webm, + &use_ivf, + &out_part, + &q_hist_n, + &rate_hist_n, + &disable_warnings, + &disable_warning_prompt, + &recontest, + NULL }; + +static const arg_def_t usage = + ARG_DEF("u", "usage", 1, "Usage profile number to use"); +static const arg_def_t threads = + ARG_DEF("t", "threads", 1, "Max number of threads to use"); +static const arg_def_t profile = + ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use"); static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width"); static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height"); #if CONFIG_WEBM_IO static const struct arg_enum_list stereo_mode_enum[] = { - {"mono", STEREO_FORMAT_MONO}, - {"left-right", STEREO_FORMAT_LEFT_RIGHT}, - {"bottom-top", STEREO_FORMAT_BOTTOM_TOP}, - {"top-bottom", STEREO_FORMAT_TOP_BOTTOM}, - {"right-left", STEREO_FORMAT_RIGHT_LEFT}, - {NULL, 0} + { "mono", STEREO_FORMAT_MONO }, + { "left-right", STEREO_FORMAT_LEFT_RIGHT }, + { "bottom-top", STEREO_FORMAT_BOTTOM_TOP }, + { "top-bottom", STEREO_FORMAT_TOP_BOTTOM }, + { "right-left", STEREO_FORMAT_RIGHT_LEFT }, + { NULL, 0 } }; static const arg_def_t stereo_mode = ARG_DEF_ENUM( NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum); #endif static const arg_def_t timebase = ARG_DEF( NULL, "timebase", 1, "Output timestamp precision (fractional seconds)"); -static const arg_def_t error_resilient = ARG_DEF( - NULL, "error-resilient", 1, "Enable error resiliency features"); -static const arg_def_t lag_in_frames = ARG_DEF( - NULL, "lag-in-frames", 1, "Max number of frames to lag"); - -static const arg_def_t *global_args[] = { - &use_yv12, &use_i420, &use_i422, &use_i444, &use_i440, - &usage, &threads, &profile, - &width, &height, +static const arg_def_t error_resilient = + ARG_DEF(NULL, "error-resilient", 1, "Enable error resiliency features"); +static const arg_def_t lag_in_frames = + ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"); + +static const arg_def_t *global_args[] = { &use_yv12, 
+ &use_i420, + &use_i422, + &use_i444, + &use_i440, + &usage, + &threads, + &profile, + &width, + &height, #if CONFIG_WEBM_IO - &stereo_mode, + &stereo_mode, #endif - &timebase, &framerate, - &error_resilient, + &timebase, + &framerate, + &error_resilient, #if CONFIG_VP9_HIGHBITDEPTH - &test16bitinternalarg, + &test16bitinternalarg, #endif - &lag_in_frames, NULL -}; - -static const arg_def_t dropframe_thresh = ARG_DEF( - NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); -static const arg_def_t resize_allowed = ARG_DEF( - NULL, "resize-allowed", 1, "Spatial resampling enabled (bool)"); -static const arg_def_t resize_width = ARG_DEF( - NULL, "resize-width", 1, "Width of encoded frame"); -static const arg_def_t resize_height = ARG_DEF( - NULL, "resize-height", 1, "Height of encoded frame"); -static const arg_def_t resize_up_thresh = ARG_DEF( - NULL, "resize-up", 1, "Upscale threshold (buf %)"); -static const arg_def_t resize_down_thresh = ARG_DEF( - NULL, "resize-down", 1, "Downscale threshold (buf %)"); -static const struct arg_enum_list end_usage_enum[] = { - {"vbr", VPX_VBR}, - {"cbr", VPX_CBR}, - {"cq", VPX_CQ}, - {"q", VPX_Q}, - {NULL, 0} -}; -static const arg_def_t end_usage = ARG_DEF_ENUM( - NULL, "end-usage", 1, "Rate control mode", end_usage_enum); -static const arg_def_t target_bitrate = ARG_DEF( - NULL, "target-bitrate", 1, "Bitrate (kbps)"); -static const arg_def_t min_quantizer = ARG_DEF( - NULL, "min-q", 1, "Minimum (best) quantizer"); -static const arg_def_t max_quantizer = ARG_DEF( - NULL, "max-q", 1, "Maximum (worst) quantizer"); -static const arg_def_t undershoot_pct = ARG_DEF( - NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)"); -static const arg_def_t overshoot_pct = ARG_DEF( - NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"); -static const arg_def_t buf_sz = ARG_DEF( - NULL, "buf-sz", 1, "Client buffer size (ms)"); -static const arg_def_t buf_initial_sz = ARG_DEF( - NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"); -static const arg_def_t buf_optimal_sz = ARG_DEF( - NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"); + &lag_in_frames, + NULL }; + +static const arg_def_t dropframe_thresh = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); +static const arg_def_t resize_allowed = + ARG_DEF(NULL, "resize-allowed", 1, "Spatial resampling enabled (bool)"); +static const arg_def_t resize_width = + ARG_DEF(NULL, "resize-width", 1, "Width of encoded frame"); +static const arg_def_t resize_height = + ARG_DEF(NULL, "resize-height", 1, "Height of encoded frame"); +static const arg_def_t resize_up_thresh = + ARG_DEF(NULL, "resize-up", 1, "Upscale threshold (buf %)"); +static const arg_def_t resize_down_thresh = + ARG_DEF(NULL, "resize-down", 1, "Downscale threshold (buf %)"); +static const struct arg_enum_list end_usage_enum[] = { { "vbr", VPX_VBR }, + { "cbr", VPX_CBR }, + { "cq", VPX_CQ }, + { "q", VPX_Q }, + { NULL, 0 } }; +static const arg_def_t end_usage = + ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum); +static const arg_def_t target_bitrate = + ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)"); +static const arg_def_t min_quantizer = + ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer"); +static const arg_def_t max_quantizer = + ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer"); +static const arg_def_t undershoot_pct = + ARG_DEF(NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)"); +static const arg_def_t overshoot_pct = + 
ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"); +static const arg_def_t buf_sz = + ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)"); +static const arg_def_t buf_initial_sz = + ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"); +static const arg_def_t buf_optimal_sz = + ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"); static const arg_def_t *rc_args[] = { - &dropframe_thresh, &resize_allowed, &resize_width, &resize_height, - &resize_up_thresh, &resize_down_thresh, &end_usage, &target_bitrate, - &min_quantizer, &max_quantizer, &undershoot_pct, &overshoot_pct, &buf_sz, - &buf_initial_sz, &buf_optimal_sz, NULL -}; - - -static const arg_def_t bias_pct = ARG_DEF( - NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); -static const arg_def_t minsection_pct = ARG_DEF( - NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); -static const arg_def_t maxsection_pct = ARG_DEF( - NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); -static const arg_def_t *rc_twopass_args[] = { - &bias_pct, &minsection_pct, &maxsection_pct, NULL -}; - - -static const arg_def_t kf_min_dist = ARG_DEF( - NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); -static const arg_def_t kf_max_dist = ARG_DEF( - NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"); -static const arg_def_t kf_disabled = ARG_DEF( - NULL, "disable-kf", 0, "Disable keyframe placement"); -static const arg_def_t *kf_args[] = { - &kf_min_dist, &kf_max_dist, &kf_disabled, NULL + &dropframe_thresh, &resize_allowed, &resize_width, &resize_height, + &resize_up_thresh, &resize_down_thresh, &end_usage, &target_bitrate, + &min_quantizer, &max_quantizer, &undershoot_pct, &overshoot_pct, + &buf_sz, &buf_initial_sz, &buf_optimal_sz, NULL }; - -static const arg_def_t noise_sens = ARG_DEF( - NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); -static const arg_def_t sharpness = ARG_DEF( - NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); -static const arg_def_t static_thresh = ARG_DEF( - NULL, "static-thresh", 1, "Motion detection threshold"); -static const arg_def_t auto_altref = ARG_DEF( - NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); -static const arg_def_t arnr_maxframes = ARG_DEF( - NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); -static const arg_def_t arnr_strength = ARG_DEF( - NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); -static const arg_def_t arnr_type = ARG_DEF( - NULL, "arnr-type", 1, "AltRef type"); +static const arg_def_t bias_pct = + ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); +static const arg_def_t minsection_pct = + ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); +static const arg_def_t maxsection_pct = + ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); +static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, + &maxsection_pct, NULL }; + +static const arg_def_t kf_min_dist = + ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); +static const arg_def_t kf_max_dist = + ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"); +static const arg_def_t kf_disabled = + ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement"); +static const arg_def_t *kf_args[] = { &kf_min_dist, &kf_max_dist, &kf_disabled, + NULL }; + +static const arg_def_t noise_sens = + ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); +static const arg_def_t sharpness = + 
ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); +static const arg_def_t static_thresh = + ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"); +static const arg_def_t auto_altref = + ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); +static const arg_def_t arnr_maxframes = + ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); +static const arg_def_t arnr_strength = + ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); +static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1, "AltRef type"); static const struct arg_enum_list tuning_enum[] = { - {"psnr", VPX_TUNE_PSNR}, - {"ssim", VPX_TUNE_SSIM}, - {NULL, 0} + { "psnr", VPX_TUNE_PSNR }, { "ssim", VPX_TUNE_SSIM }, { NULL, 0 } }; -static const arg_def_t tune_ssim = ARG_DEF_ENUM( - NULL, "tune", 1, "Material to favor", tuning_enum); -static const arg_def_t cq_level = ARG_DEF( - NULL, "cq-level", 1, "Constant/Constrained Quality level"); -static const arg_def_t max_intra_rate_pct = ARG_DEF( - NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); - +static const arg_def_t tune_ssim = + ARG_DEF_ENUM(NULL, "tune", 1, "Material to favor", tuning_enum); +static const arg_def_t cq_level = + ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"); +static const arg_def_t max_intra_rate_pct = + ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); #if CONFIG_VP10_ENCODER -static const arg_def_t cpu_used_vp9 = ARG_DEF( - NULL, "cpu-used", 1, "CPU Used (-8..8)"); -static const arg_def_t tile_cols = ARG_DEF( - NULL, "tile-columns", 1, "Number of tile columns to use, log2"); -static const arg_def_t tile_rows = ARG_DEF( - NULL, "tile-rows", 1, - "Number of tile rows to use, log2 (set to 0 while threads > 1)"); -static const arg_def_t lossless = ARG_DEF( - NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); +static const arg_def_t cpu_used_vp9 = + ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-8..8)"); +static const arg_def_t tile_cols = + ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"); +static const arg_def_t tile_rows = + ARG_DEF(NULL, "tile-rows", 1, + "Number of tile rows to use, log2 (set to 0 while threads > 1)"); +static const arg_def_t lossless = + ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); static const arg_def_t frame_parallel_decoding = ARG_DEF( NULL, "frame-parallel", 1, "Enable frame parallel decodability features"); static const arg_def_t aq_mode = ARG_DEF( NULL, "aq-mode", 1, "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, " "3: cyclic refresh, 4: equator360)"); -static const arg_def_t frame_periodic_boost = ARG_DEF( - NULL, "frame-boost", 1, - "Enable frame periodic boost (0: off (default), 1: on)"); +static const arg_def_t frame_periodic_boost = + ARG_DEF(NULL, "frame-boost", 1, + "Enable frame periodic boost (0: off (default), 1: on)"); static const arg_def_t gf_cbr_boost_pct = ARG_DEF( NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); -static const arg_def_t max_inter_rate_pct = ARG_DEF( - NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); +static const arg_def_t max_inter_rate_pct = + ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); static const arg_def_t min_gf_interval = ARG_DEF( NULL, "min-gf-interval", 1, "min gf/arf frame interval (default 0, indicating in-built behavior)"); @@ -393,30 +398,27 @@ static const struct arg_enum_list color_space_enum[] = { { NULL, 0 } }; -static const 
arg_def_t input_color_space = ARG_DEF_ENUM( - NULL, "color-space", 1, - "The color space of input content:", color_space_enum); +static const arg_def_t input_color_space = + ARG_DEF_ENUM(NULL, "color-space", 1, "The color space of input content:", + color_space_enum); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { - {"8", VPX_BITS_8}, - {"10", VPX_BITS_10}, - {"12", VPX_BITS_12}, - {NULL, 0} + { "8", VPX_BITS_8 }, { "10", VPX_BITS_10 }, { "12", VPX_BITS_12 }, { NULL, 0 } }; static const arg_def_t bitdeptharg = ARG_DEF_ENUM( "b", "bit-depth", 1, "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)", bitdepth_enum); -static const arg_def_t inbitdeptharg = ARG_DEF( - NULL, "input-bit-depth", 1, "Bit depth of input"); +static const arg_def_t inbitdeptharg = + ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"); #endif static const struct arg_enum_list tune_content_enum[] = { - {"default", VPX_CONTENT_DEFAULT}, - {"screen", VPX_CONTENT_SCREEN}, - {NULL, 0} + { "default", VPX_CONTENT_DEFAULT }, + { "screen", VPX_CONTENT_SCREEN }, + { NULL, 0 } }; static const arg_def_t tune_content = ARG_DEF_ENUM( @@ -428,51 +430,76 @@ static const arg_def_t target_level = ARG_DEF( " 11: level 1.1; ... 62: level 6.2)"); #endif - #if CONFIG_VP10_ENCODER #if CONFIG_EXT_PARTITION static const struct arg_enum_list superblock_size_enum[] = { - {"dynamic", VPX_SUPERBLOCK_SIZE_DYNAMIC}, - {"64", VPX_SUPERBLOCK_SIZE_64X64}, - {"128", VPX_SUPERBLOCK_SIZE_128X128}, - {NULL, 0} + { "dynamic", VPX_SUPERBLOCK_SIZE_DYNAMIC }, + { "64", VPX_SUPERBLOCK_SIZE_64X64 }, + { "128", VPX_SUPERBLOCK_SIZE_128X128 }, + { NULL, 0 } }; static const arg_def_t superblock_size = ARG_DEF_ENUM( NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum); #endif // CONFIG_EXT_PARTITION -static const arg_def_t *vp10_args[] = { - &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh, - &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type, - &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct, - &gf_cbr_boost_pct, &lossless, - &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, - &noise_sens, &tune_content, &input_color_space, - &min_gf_interval, &max_gf_interval, +static const arg_def_t *vp10_args[] = { &cpu_used_vp9, + &auto_altref, + &sharpness, + &static_thresh, + &tile_cols, + &tile_rows, + &arnr_maxframes, + &arnr_strength, + &arnr_type, + &tune_ssim, + &cq_level, + &max_intra_rate_pct, + &max_inter_rate_pct, + &gf_cbr_boost_pct, + &lossless, + &frame_parallel_decoding, + &aq_mode, + &frame_periodic_boost, + &noise_sens, + &tune_content, + &input_color_space, + &min_gf_interval, + &max_gf_interval, #if CONFIG_EXT_PARTITION - &superblock_size, + &superblock_size, #endif // CONFIG_EXT_PARTITION #if CONFIG_VP9_HIGHBITDEPTH - &bitdeptharg, &inbitdeptharg, + &bitdeptharg, + &inbitdeptharg, #endif // CONFIG_VP9_HIGHBITDEPTH - NULL -}; -static const int vp10_arg_ctrl_map[] = { - VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF, - VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD, - VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS, - VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, - VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, - VP9E_SET_MAX_INTER_BITRATE_PCT, VP9E_SET_GF_CBR_BOOST_PCT, - VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE, - VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY, - VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE, - VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL, + NULL }; 
+static const int vp10_arg_ctrl_map[] = { VP8E_SET_CPUUSED, + VP8E_SET_ENABLEAUTOALTREF, + VP8E_SET_SHARPNESS, + VP8E_SET_STATIC_THRESHOLD, + VP9E_SET_TILE_COLUMNS, + VP9E_SET_TILE_ROWS, + VP8E_SET_ARNR_MAXFRAMES, + VP8E_SET_ARNR_STRENGTH, + VP8E_SET_ARNR_TYPE, + VP8E_SET_TUNING, + VP8E_SET_CQ_LEVEL, + VP8E_SET_MAX_INTRA_BITRATE_PCT, + VP9E_SET_MAX_INTER_BITRATE_PCT, + VP9E_SET_GF_CBR_BOOST_PCT, + VP9E_SET_LOSSLESS, + VP9E_SET_FRAME_PARALLEL_DECODING, + VP9E_SET_AQ_MODE, + VP9E_SET_FRAME_PERIODIC_BOOST, + VP9E_SET_NOISE_SENSITIVITY, + VP9E_SET_TUNE_CONTENT, + VP9E_SET_COLOR_SPACE, + VP9E_SET_MIN_GF_INTERVAL, + VP9E_SET_MAX_GF_INTERVAL, #if CONFIG_EXT_PARTITION - VP10E_SET_SUPERBLOCK_SIZE, + VP10E_SET_SUPERBLOCK_SIZE, #endif // CONFIG_EXT_PARTITION - 0 -}; + 0 }; #endif static const arg_def_t *no_args[] = { NULL }; @@ -498,17 +525,17 @@ void usage_exit(void) { fprintf(stderr, "\nVP10 Specific Options:\n"); arg_show_usage(stderr, vp10_args); #endif - fprintf(stderr, "\nStream timebase (--timebase):\n" + fprintf(stderr, + "\nStream timebase (--timebase):\n" " The desired precision of timestamps in the output, expressed\n" " in fractional seconds. Default is 1/1000.\n"); fprintf(stderr, "\nIncluded encoders:\n\n"); for (i = 0; i < num_encoder; ++i) { const VpxInterface *const encoder = get_vpx_encoder_by_index(i); - const char* defstr = (i == (num_encoder - 1)) ? "(default)" : ""; - fprintf(stderr, " %-6s - %s %s\n", - encoder->name, vpx_codec_iface_name(encoder->codec_interface()), - defstr); + const char *defstr = (i == (num_encoder - 1)) ? "(default)" : ""; + fprintf(stderr, " %-6s - %s %s\n", encoder->name, + vpx_codec_iface_name(encoder->codec_interface()), defstr); } fprintf(stderr, "\n "); fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n"); @@ -516,12 +543,12 @@ void usage_exit(void) { exit(EXIT_FAILURE); } -#define mmin(a, b) ((a) < (b) ? (a) : (b)) +#define mmin(a, b) ((a) < (b) ? 
(a) : (b)) #if CONFIG_VP9_HIGHBITDEPTH static void find_mismatch_high(const vpx_image_t *const img1, - const vpx_image_t *const img2, - int yloc[4], int uloc[4], int vloc[4]) { + const vpx_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]) { uint16_t *plane1, *plane2; uint32_t stride1, stride2; const uint32_t bsize = 64; @@ -534,10 +561,10 @@ static void find_mismatch_high(const vpx_image_t *const img1, int match = 1; uint32_t i, j; yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - plane1 = (uint16_t*)img1->planes[VPX_PLANE_Y]; - plane2 = (uint16_t*)img2->planes[VPX_PLANE_Y]; - stride1 = img1->stride[VPX_PLANE_Y]/2; - stride2 = img2->stride[VPX_PLANE_Y]/2; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y]; + stride1 = img1->stride[VPX_PLANE_Y] / 2; + stride2 = img2->stride[VPX_PLANE_Y] / 2; for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { for (j = 0; match && j < img1->d_w; j += bsize) { int k, l; @@ -560,10 +587,10 @@ static void find_mismatch_high(const vpx_image_t *const img1, } uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - plane1 = (uint16_t*)img1->planes[VPX_PLANE_U]; - plane2 = (uint16_t*)img2->planes[VPX_PLANE_U]; - stride1 = img1->stride[VPX_PLANE_U]/2; - stride2 = img2->stride[VPX_PLANE_U]/2; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_U]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_U]; + stride1 = img1->stride[VPX_PLANE_U] / 2; + stride2 = img2->stride[VPX_PLANE_U] / 2; for (i = 0, match = 1; match && i < c_h; i += bsizey) { for (j = 0; match && j < c_w; j += bsizex) { int k, l; @@ -586,10 +613,10 @@ static void find_mismatch_high(const vpx_image_t *const img1, } vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - plane1 = (uint16_t*)img1->planes[VPX_PLANE_V]; - plane2 = (uint16_t*)img2->planes[VPX_PLANE_V]; - stride1 = img1->stride[VPX_PLANE_V]/2; - stride2 = img2->stride[VPX_PLANE_V]/2; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_V]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_V]; + stride1 = img1->stride[VPX_PLANE_V] / 2; + stride2 = img2->stride[VPX_PLANE_V] / 2; for (i = 0, match = 1; match && i < c_h; i += bsizey) { for (j = 0; match && j < c_w; j += bsizex) { int k, l; @@ -614,8 +641,8 @@ static void find_mismatch_high(const vpx_image_t *const img1, #endif static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, - int yloc[4], int uloc[4], int vloc[4]) { + const vpx_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]) { const uint32_t bsize = 64; const uint32_t bsizey = bsize >> img1->y_chroma_shift; const uint32_t bsizex = bsize >> img1->x_chroma_shift; @@ -706,8 +733,7 @@ static void find_mismatch(const vpx_image_t *const img1, static int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) { uint32_t l_w = img1->d_w; - uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; const uint32_t c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; uint32_t i; @@ -741,84 +767,79 @@ static int compare_img(const vpx_image_t *const img1, return match; } - -#define NELEMENTS(x) (sizeof(x)/sizeof(x[0])) +#define NELEMENTS(x) (sizeof(x) / sizeof(x[0])) #if CONFIG_VP10_ENCODER #define ARG_CTRL_CNT_MAX NELEMENTS(vp10_arg_ctrl_map) #endif #if !CONFIG_WEBM_IO typedef int stereo_format_t; -struct WebmOutputContext { int debug; }; +struct WebmOutputContext { + int debug; +}; #endif /* Per-stream configuration */ struct stream_config { - struct 
vpx_codec_enc_cfg cfg; - const char *out_fn; - const char *stats_fn; + struct vpx_codec_enc_cfg cfg; + const char *out_fn; + const char *stats_fn; #if CONFIG_FP_MB_STATS - const char *fpmb_stats_fn; + const char *fpmb_stats_fn; #endif - stereo_format_t stereo_fmt; - int arg_ctrls[ARG_CTRL_CNT_MAX][2]; - int arg_ctrl_cnt; - int write_webm; + stereo_format_t stereo_fmt; + int arg_ctrls[ARG_CTRL_CNT_MAX][2]; + int arg_ctrl_cnt; + int write_webm; #if CONFIG_VP9_HIGHBITDEPTH // whether to use 16bit internal buffers - int use_16bit_internal; + int use_16bit_internal; #endif }; - struct stream_state { - int index; - struct stream_state *next; - struct stream_config config; - FILE *file; - struct rate_hist *rate_hist; - struct WebmOutputContext webm_ctx; - uint64_t psnr_sse_total; - uint64_t psnr_samples_total; - double psnr_totals[4]; - int psnr_count; - int counts[64]; - vpx_codec_ctx_t encoder; - unsigned int frames_out; - uint64_t cx_time; - size_t nbytes; - stats_io_t stats; + int index; + struct stream_state *next; + struct stream_config config; + FILE *file; + struct rate_hist *rate_hist; + struct WebmOutputContext webm_ctx; + uint64_t psnr_sse_total; + uint64_t psnr_samples_total; + double psnr_totals[4]; + int psnr_count; + int counts[64]; + vpx_codec_ctx_t encoder; + unsigned int frames_out; + uint64_t cx_time; + size_t nbytes; + stats_io_t stats; #if CONFIG_FP_MB_STATS - stats_io_t fpmb_stats; + stats_io_t fpmb_stats; #endif - struct vpx_image *img; - vpx_codec_ctx_t decoder; - int mismatch_seen; + struct vpx_image *img; + vpx_codec_ctx_t decoder; + int mismatch_seen; }; - -static void validate_positive_rational(const char *msg, +static void validate_positive_rational(const char *msg, struct vpx_rational *rat) { if (rat->den < 0) { rat->num *= -1; rat->den *= -1; } - if (rat->num < 0) - die("Error: %s must be positive\n", msg); + if (rat->num < 0) die("Error: %s must be positive\n", msg); - if (!rat->den) - die("Error: %s has zero denominator\n", msg); + if (!rat->den) die("Error: %s has zero denominator\n", msg); } - static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { - char **argi, **argj; - struct arg arg; + char **argi, **argj; + struct arg arg; const int num_encoder = get_vpx_encoder_count(); - if (num_encoder < 1) - die("Error: no valid encoder available\n"); + if (num_encoder < 1) die("Error: no valid encoder available\n"); /* Initialize default parameters */ memset(global, 0, sizeof(*global)); @@ -844,8 +865,7 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { global->pass = arg_parse_uint(&arg); if (global->pass < 1 || global->pass > 2) - die("Error: Invalid pass selected (%d)\n", - global->pass); + die("Error: Invalid pass selected (%d)\n", global->pass); } else if (arg_match(&arg, &usage, argi)) global->usage = arg_parse_uint(&arg); else if (arg_match(&arg, &deadline, argi)) @@ -901,8 +921,8 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { if (global->pass) { /* DWIM: Assume the user meant passes=2 if pass=2 is specified */ if (global->pass > global->passes) { - warn("Assuming --pass=%d implies --passes=%d\n", - global->pass, global->pass); + warn("Assuming --pass=%d implies --passes=%d\n", global->pass, + global->pass); global->passes = global->pass; } } @@ -913,27 +933,26 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { // encoder if (global->codec != NULL && global->codec->name != NULL) global->passes = (strcmp(global->codec->name, "vp9") == 0 && 
- global->deadline != VPX_DL_REALTIME) ? 2 : 1; + global->deadline != VPX_DL_REALTIME) + ? 2 + : 1; #else global->passes = 1; #endif } - if (global->deadline == VPX_DL_REALTIME && - global->passes > 1) { + if (global->deadline == VPX_DL_REALTIME && global->passes > 1) { warn("Enforcing one-pass encoding in realtime mode\n"); global->passes = 1; } } - static void open_input_file(struct VpxInputContext *input) { /* Parse certain options from the input file, if possible */ - input->file = strcmp(input->filename, "-") - ? fopen(input->filename, "rb") : set_binary_mode(stdin); + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); - if (!input->file) - fatal("Failed to open input file"); + if (!input->file) fatal("Failed to open input file"); if (!fseeko(input->file, 0, SEEK_END)) { /* Input file is seekable. Figure out how long it is, so we can get @@ -953,8 +972,7 @@ static void open_input_file(struct VpxInputContext *input) { input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); input->detect.position = 0; - if (input->detect.buf_read == 4 - && file_is_y4m(input->detect.buf)) { + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, input->only_i420) >= 0) { input->file_type = FILE_TYPE_Y4M; @@ -975,11 +993,9 @@ static void open_input_file(struct VpxInputContext *input) { } } - static void close_input_file(struct VpxInputContext *input) { fclose(input->file); - if (input->file_type == FILE_TYPE_Y4M) - y4m_input_close(&input->y4m); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); } static struct stream_state *new_stream(struct VpxEncoderConfig *global, @@ -996,14 +1012,12 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, stream->index++; prev->next = stream; } else { - vpx_codec_err_t res; + vpx_codec_err_t res; /* Populate encoder configuration */ res = vpx_codec_enc_config_default(global->codec->codec_interface(), - &stream->config.cfg, - global->usage); - if (res) - fatal("Failed to get config: %s\n", vpx_codec_err_to_string(res)); + &stream->config.cfg, global->usage); + if (res) fatal("Failed to get config: %s\n", vpx_codec_err_to_string(res)); /* Change the default timebase to a high enough value so that the * encoder will always create strictly increasing timestamps. 
@@ -1040,18 +1054,16 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, return stream; } - static int parse_stream_params(struct VpxEncoderConfig *global, - struct stream_state *stream, - char **argv) { - char **argi, **argj; - struct arg arg; + struct stream_state *stream, char **argv) { + char **argi, **argj; + struct arg arg; static const arg_def_t **ctrl_args = no_args; - static const int *ctrl_args_map = NULL; - struct stream_config *config = &stream->config; - int eos_mark_found = 0; + static const int *ctrl_args_map = NULL; + struct stream_config *config = &stream->config; + int eos_mark_found = 0; #if CONFIG_VP9_HIGHBITDEPTH - int test_16bit_internal = 0; + int test_16bit_internal = 0; #endif // Handle codec specific options @@ -1156,7 +1168,7 @@ static int parse_stream_params(struct VpxEncoderConfig *global, } else if (arg_match(&arg, &buf_optimal_sz, argi)) { config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg); } else if (arg_match(&arg, &bias_pct, argi)) { - config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); + config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); if (global->passes < 2) warn("option %s ignored in one-pass mode.\n", arg.name); } else if (arg_match(&arg, &minsection_pct, argi)) { @@ -1202,43 +1214,41 @@ static int parse_stream_params(struct VpxEncoderConfig *global, if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) { config->arg_ctrls[j][0] = ctrl_args_map[i]; config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg); - if (j == config->arg_ctrl_cnt) - config->arg_ctrl_cnt++; + if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++; } } } - if (!match) - argj++; + if (!match) argj++; } } #if CONFIG_VP9_HIGHBITDEPTH if (strcmp(global->codec->name, "vp9") == 0 || strcmp(global->codec->name, "vp10") == 0) { - config->use_16bit_internal = test_16bit_internal | - (config->cfg.g_profile > 1); + config->use_16bit_internal = + test_16bit_internal | (config->cfg.g_profile > 1); } #endif return eos_mark_found; } - -#define FOREACH_STREAM(func) \ - do { \ - struct stream_state *stream; \ +#define FOREACH_STREAM(func) \ + do { \ + struct stream_state *stream; \ for (stream = streams; stream; stream = stream->next) { \ - func; \ - } \ + func; \ + } \ } while (0) - static void validate_stream_config(const struct stream_state *stream, const struct VpxEncoderConfig *global) { const struct stream_state *streami; (void)global; if (!stream->config.cfg.g_w || !stream->config.cfg.g_h) - fatal("Stream %d: Specify stream dimensions with --width (-w) " - " and --height (-h)", stream->index); + fatal( + "Stream %d: Specify stream dimensions with --width (-w) " + " and --height (-h)", + stream->index); // Check that the codec bit depth is greater than the input bit depth. 
if (stream->config.cfg.g_input_bit_depth > @@ -1285,9 +1295,7 @@ static void validate_stream_config(const struct stream_state *stream, } } - -static void set_stream_dimensions(struct stream_state *stream, - unsigned int w, +static void set_stream_dimensions(struct stream_state *stream, unsigned int w, unsigned int h) { if (!stream->config.cfg.g_w) { if (!stream->config.cfg.g_h) @@ -1300,7 +1308,7 @@ static void set_stream_dimensions(struct stream_state *stream, } } -static const char* file_type_to_string(enum VideoFileType t) { +static const char *file_type_to_string(enum VideoFileType t) { switch (t) { case FILE_TYPE_RAW: return "RAW"; case FILE_TYPE_Y4M: return "Y4M"; @@ -1308,7 +1316,7 @@ static const char* file_type_to_string(enum VideoFileType t) { } } -static const char* image_format_to_string(vpx_img_fmt_t f) { +static const char *image_format_to_string(vpx_img_fmt_t f) { switch (f) { case VPX_IMG_FMT_I420: return "I420"; case VPX_IMG_FMT_I422: return "I422"; @@ -1326,7 +1334,6 @@ static const char* image_format_to_string(vpx_img_fmt_t f) { static void show_stream_config(struct stream_state *stream, struct VpxEncoderConfig *global, struct VpxInputContext *input) { - #define SHOW(field) \ fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field) @@ -1334,8 +1341,7 @@ static void show_stream_config(struct stream_state *stream, fprintf(stderr, "Codec: %s\n", vpx_codec_iface_name(global->codec->codec_interface())); fprintf(stderr, "Source file: %s File Type: %s Format: %s\n", - input->filename, - file_type_to_string(input->file_type), + input->filename, file_type_to_string(input->file_type), image_format_to_string(input->fmt)); } if (stream->next || stream->index) @@ -1378,20 +1384,17 @@ static void show_stream_config(struct stream_state *stream, SHOW(kf_max_dist); } - static void open_output_file(struct stream_state *stream, struct VpxEncoderConfig *global, const struct VpxRational *pixel_aspect_ratio) { const char *fn = stream->config.out_fn; const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg; - if (cfg->g_pass == VPX_RC_FIRST_PASS) - return; + if (cfg->g_pass == VPX_RC_FIRST_PASS) return; stream->file = strcmp(fn, "-") ? 
fopen(fn, "wb") : set_binary_mode(stdout); - if (!stream->file) - fatal("Failed to open output file"); + if (!stream->file) fatal("Failed to open output file"); if (stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR)) fatal("WebM output to pipes not supported."); @@ -1399,10 +1402,8 @@ static void open_output_file(struct stream_state *stream, #if CONFIG_WEBM_IO if (stream->config.write_webm) { stream->webm_ctx.stream = stream->file; - write_webm_file_header(&stream->webm_ctx, cfg, - &global->framerate, - stream->config.stereo_fmt, - global->codec->fourcc, + write_webm_file_header(&stream->webm_ctx, cfg, &global->framerate, + stream->config.stereo_fmt, global->codec->fourcc, pixel_aspect_ratio); } #else @@ -1414,13 +1415,11 @@ static void open_output_file(struct stream_state *stream, } } - static void close_output_file(struct stream_state *stream, unsigned int fourcc) { const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg; - if (cfg->g_pass == VPX_RC_FIRST_PASS) - return; + if (cfg->g_pass == VPX_RC_FIRST_PASS) return; #if CONFIG_WEBM_IO if (stream->config.write_webm) { @@ -1430,21 +1429,17 @@ static void close_output_file(struct stream_state *stream, if (!stream->config.write_webm) { if (!fseek(stream->file, 0, SEEK_SET)) - ivf_write_file_header(stream->file, &stream->config.cfg, - fourcc, + ivf_write_file_header(stream->file, &stream->config.cfg, fourcc, stream->frames_out); } fclose(stream->file); } - static void setup_pass(struct stream_state *stream, - struct VpxEncoderConfig *global, - int pass) { + struct VpxEncoderConfig *global, int pass) { if (stream->config.stats_fn) { - if (!stats_open_file(&stream->stats, stream->config.stats_fn, - pass)) + if (!stats_open_file(&stream->stats, stream->config.stats_fn, pass)) fatal("Failed to open statistics store"); } else { if (!stats_open_mem(&stream->stats, pass)) @@ -1453,8 +1448,8 @@ static void setup_pass(struct stream_state *stream, #if CONFIG_FP_MB_STATS if (stream->config.fpmb_stats_fn) { - if (!stats_open_file(&stream->fpmb_stats, - stream->config.fpmb_stats_fn, pass)) + if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn, + pass)) fatal("Failed to open mb statistics store"); } else { if (!stats_open_mem(&stream->fpmb_stats, pass)) @@ -1463,8 +1458,8 @@ static void setup_pass(struct stream_state *stream, #endif stream->config.cfg.g_pass = global->passes == 2 - ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS - : VPX_RC_ONE_PASS; + ? pass ? 
VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS + : VPX_RC_ONE_PASS; if (pass) { stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); #if CONFIG_FP_MB_STATS @@ -1478,7 +1473,6 @@ static void setup_pass(struct stream_state *stream, stream->frames_out = 0; } - static void initialize_encoder(struct stream_state *stream, struct VpxEncoderConfig *global) { int i; @@ -1503,8 +1497,7 @@ static void initialize_encoder(struct stream_state *stream, int ctrl = stream->config.arg_ctrls[i][0]; int value = stream->config.arg_ctrls[i][1]; if (vpx_codec_control_(&stream->encoder, ctrl, value)) - fprintf(stderr, "Error: Tried to set control %d = %d\n", - ctrl, value); + fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value); ctx_exit_on_error(&stream->encoder, "Failed to control codec"); } @@ -1512,7 +1505,7 @@ static void initialize_encoder(struct stream_state *stream, #if CONFIG_DECODERS if (global->test_decode != TEST_DECODE_OFF) { const VpxInterface *decoder = get_vpx_decoder_by_name(global->codec->name); - vpx_codec_dec_cfg_t cfg = { 0, 0, 0}; + vpx_codec_dec_cfg_t cfg = { 0, 0, 0 }; vpx_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0); #if CONFIG_VP10_DECODER && CONFIG_EXT_TILE @@ -1528,23 +1521,21 @@ static void initialize_encoder(struct stream_state *stream, #endif } - static void encode_frame(struct stream_state *stream, - struct VpxEncoderConfig *global, - struct vpx_image *img, + struct VpxEncoderConfig *global, struct vpx_image *img, unsigned int frames_in) { vpx_codec_pts_t frame_start, next_frame_start; struct vpx_codec_enc_cfg *cfg = &stream->config.cfg; struct vpx_usec_timer timer; - frame_start = (cfg->g_timebase.den * (int64_t)(frames_in - 1) - * global->framerate.den) - / cfg->g_timebase.num / global->framerate.num; - next_frame_start = (cfg->g_timebase.den * (int64_t)(frames_in) - * global->framerate.den) - / cfg->g_timebase.num / global->framerate.num; + frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in - 1) * global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + next_frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; - /* Scale if necessary */ +/* Scale if necessary */ #if CONFIG_VP9_HIGHBITDEPTH if (img) { if ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) && @@ -1555,32 +1546,28 @@ static void encode_frame(struct stream_state *stream, } #if CONFIG_LIBYUV if (!stream->img) { - stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, - cfg->g_w, cfg->g_h, 16); + stream->img = + vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); } - I420Scale_16((uint16*)img->planes[VPX_PLANE_Y], - img->stride[VPX_PLANE_Y]/2, - (uint16*)img->planes[VPX_PLANE_U], - img->stride[VPX_PLANE_U]/2, - (uint16*)img->planes[VPX_PLANE_V], - img->stride[VPX_PLANE_V]/2, - img->d_w, img->d_h, - (uint16*)stream->img->planes[VPX_PLANE_Y], - stream->img->stride[VPX_PLANE_Y]/2, - (uint16*)stream->img->planes[VPX_PLANE_U], - stream->img->stride[VPX_PLANE_U]/2, - (uint16*)stream->img->planes[VPX_PLANE_V], - stream->img->stride[VPX_PLANE_V]/2, - stream->img->d_w, stream->img->d_h, - kFilterBox); + I420Scale_16( + (uint16 *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2, + (uint16 *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2, + (uint16 *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2, + img->d_w, img->d_h, (uint16 *)stream->img->planes[VPX_PLANE_Y], + stream->img->stride[VPX_PLANE_Y] / 2, + (uint16 *)stream->img->planes[VPX_PLANE_U], + 
stream->img->stride[VPX_PLANE_U] / 2, + (uint16 *)stream->img->planes[VPX_PLANE_V], + stream->img->stride[VPX_PLANE_V] / 2, stream->img->d_w, + stream->img->d_h, kFilterBox); img = stream->img; #else - stream->encoder.err = 1; - ctx_exit_on_error(&stream->encoder, - "Stream %d: Failed to encode frame.\n" - "Scaling disabled in this configuration. \n" - "To enable, configure with --enable-libyuv\n", - stream->index); + stream->encoder.err = 1; + ctx_exit_on_error(&stream->encoder, + "Stream %d: Failed to encode frame.\n" + "Scaling disabled in this configuration. \n" + "To enable, configure with --enable-libyuv\n", + stream->index); #endif } } @@ -1592,20 +1579,16 @@ static void encode_frame(struct stream_state *stream, } #if CONFIG_LIBYUV if (!stream->img) - stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, - cfg->g_w, cfg->g_h, 16); - I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y], - img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U], - img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V], - img->d_w, img->d_h, - stream->img->planes[VPX_PLANE_Y], - stream->img->stride[VPX_PLANE_Y], - stream->img->planes[VPX_PLANE_U], - stream->img->stride[VPX_PLANE_U], - stream->img->planes[VPX_PLANE_V], - stream->img->stride[VPX_PLANE_V], - stream->img->d_w, stream->img->d_h, - kFilterBox); + stream->img = + vpx_img_alloc(NULL, VPX_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16); + I420Scale( + img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y], + img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U], + img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V], img->d_w, img->d_h, + stream->img->planes[VPX_PLANE_Y], stream->img->stride[VPX_PLANE_Y], + stream->img->planes[VPX_PLANE_U], stream->img->stride[VPX_PLANE_U], + stream->img->planes[VPX_PLANE_V], stream->img->stride[VPX_PLANE_V], + stream->img->d_w, stream->img->d_h, kFilterBox); img = stream->img; #else stream->encoder.err = 1; @@ -1619,15 +1602,14 @@ static void encode_frame(struct stream_state *stream, vpx_usec_timer_start(&timer); vpx_codec_encode(&stream->encoder, img, frame_start, - (unsigned long)(next_frame_start - frame_start), - 0, global->deadline); + (unsigned long)(next_frame_start - frame_start), 0, + global->deadline); vpx_usec_timer_mark(&timer); stream->cx_time += vpx_usec_timer_elapsed(&timer); ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame", stream->index); } - static void update_quantizer_histogram(struct stream_state *stream) { if (stream->config.cfg.g_pass != VPX_RC_FIRST_PASS) { int q; @@ -1638,10 +1620,8 @@ static void update_quantizer_histogram(struct stream_state *stream) { } } - static void get_cx_data(struct stream_state *stream, - struct VpxEncoderConfig *global, - int *got_data) { + struct VpxEncoderConfig *global, int *got_data) { const vpx_codec_cx_pkt_t *pkt; const struct vpx_codec_enc_cfg *cfg = &stream->config.cfg; vpx_codec_iter_t iter = NULL; @@ -1682,8 +1662,8 @@ static void get_cx_data(struct stream_state *stream, } } - (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, - stream->file); + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, + stream->file); } stream->nbytes += pkt->data.raw.sz; @@ -1704,15 +1684,13 @@ static void get_cx_data(struct stream_state *stream, break; case VPX_CODEC_STATS_PKT: stream->frames_out++; - stats_write(&stream->stats, - pkt->data.twopass_stats.buf, + stats_write(&stream->stats, pkt->data.twopass_stats.buf, pkt->data.twopass_stats.sz); stream->nbytes += pkt->data.raw.sz; break; #if CONFIG_FP_MB_STATS case VPX_CODEC_FPMB_STATS_PKT: - 
stats_write(&stream->fpmb_stats, - pkt->data.firstpass_mb_stats.buf, + stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf, pkt->data.firstpass_mb_stats.sz); stream->nbytes += pkt->data.raw.sz; break; @@ -1733,19 +1711,16 @@ static void get_cx_data(struct stream_state *stream, } break; - default: - break; + default: break; } } } - -static void show_psnr(struct stream_state *stream, double peak) { +static void show_psnr(struct stream_state *stream, double peak) { int i; double ovpsnr; - if (!stream->psnr_count) - return; + if (!stream->psnr_count) return; fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index); ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak, @@ -1758,18 +1733,16 @@ static void show_psnr(struct stream_state *stream, double peak) { fprintf(stderr, "\n"); } - static float usec_to_fps(uint64_t usec, unsigned int frames) { return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0); } -static void test_decode(struct stream_state *stream, +static void test_decode(struct stream_state *stream, enum TestDecodeFatality fatal, const VpxInterface *codec) { vpx_image_t enc_img, dec_img; - if (stream->mismatch_seen) - return; + if (stream->mismatch_seen) return; /* Get the internal reference frame */ if (strcmp(codec->name, "vp8") == 0) { @@ -1831,10 +1804,8 @@ static void test_decode(struct stream_state *stream, " Y[%d, %d] {%d/%d}," " U[%d, %d] {%d/%d}," " V[%d, %d] {%d/%d}", - stream->index, stream->frames_out, - y[0], y[1], y[2], y[3], - u[0], u[1], u[2], u[3], - v[0], v[1], v[2], v[3]); + stream->index, stream->frames_out, y[0], y[1], y[2], + y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]); stream->mismatch_seen = stream->frames_out; } @@ -1842,7 +1813,6 @@ static void test_decode(struct stream_state *stream, vpx_img_free(&dec_img); } - static void print_time(const char *label, int64_t etl) { int64_t hours; int64_t mins; @@ -1855,14 +1825,13 @@ static void print_time(const char *label, int64_t etl) { etl -= mins * 60; secs = etl; - fprintf(stderr, "[%3s %2"PRId64":%02"PRId64":%02"PRId64"] ", - label, hours, mins, secs); + fprintf(stderr, "[%3s %2" PRId64 ":%02" PRId64 ":%02" PRId64 "] ", label, + hours, mins, secs); } else { fprintf(stderr, "[%3s unknown] ", label); } } - int main(int argc, const char **argv_) { int pass; vpx_image_t raw; @@ -1885,8 +1854,7 @@ int main(int argc, const char **argv_) { memset(&input, 0, sizeof(input)); exec_name = argv_[0]; - if (argc < 3) - usage_exit(); + if (argc < 3) usage_exit(); /* Setup default input stream settings */ input.framerate.numerator = 30; @@ -1902,21 +1870,11 @@ int main(int argc, const char **argv_) { parse_global_config(&global, argv); switch (global.color_type) { - case I420: - input.fmt = VPX_IMG_FMT_I420; - break; - case I422: - input.fmt = VPX_IMG_FMT_I422; - break; - case I444: - input.fmt = VPX_IMG_FMT_I444; - break; - case I440: - input.fmt = VPX_IMG_FMT_I440; - break; - case YV12: - input.fmt = VPX_IMG_FMT_YV12; - break; + case I420: input.fmt = VPX_IMG_FMT_I420; break; + case I422: input.fmt = VPX_IMG_FMT_I422; break; + case I444: input.fmt = VPX_IMG_FMT_I444; break; + case I440: input.fmt = VPX_IMG_FMT_I440; break; + case YV12: input.fmt = VPX_IMG_FMT_YV12; break; } { @@ -1929,8 +1887,7 @@ int main(int argc, const char **argv_) { do { stream = new_stream(&global, stream); stream_cnt++; - if (!streams) - streams = stream; + if (!streams) streams = stream; } while (parse_stream_params(&global, stream, argv)); } @@ -1939,14 +1896,13 @@ int main(int argc, const char 
**argv_) { if (argi[0][0] == '-' && argi[0][1]) die("Error: Unrecognized option %s\n", *argi); - FOREACH_STREAM(check_encoder_config(global.disable_warning_prompt, - &global, &stream->config.cfg);); + FOREACH_STREAM(check_encoder_config(global.disable_warning_prompt, &global, + &stream->config.cfg);); /* Handle non-option arguments */ input.filename = argv[0]; - if (!input.filename) - usage_exit(); + if (!input.filename) usage_exit(); /* Decide if other chroma subsamplings than 4:2:0 are supported */ if (global.codec->fourcc == VP9_FOURCC || global.codec->fourcc == VP10_FOURCC) @@ -1975,8 +1931,9 @@ int main(int argc, const char **argv_) { /* Update stream configurations from the input file's parameters */ if (!input.width || !input.height) - fatal("Specify stream dimensions with --width (-w) " - " and --height (-h)"); + fatal( + "Specify stream dimensions with --width (-w) " + " and --height (-h)"); /* If input file does not specify bit-depth but input-bit-depth parameter * exists, assume that to be the input bit-depth. However, if the @@ -1993,9 +1950,8 @@ int main(int argc, const char **argv_) { }); if (input.bit_depth > 8) input.fmt |= VPX_IMG_FMT_HIGHBITDEPTH; } else { - FOREACH_STREAM({ - stream->config.cfg.g_input_bit_depth = input.bit_depth; - }); + FOREACH_STREAM( + { stream->config.cfg.g_input_bit_depth = input.bit_depth; }); } FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height)); @@ -2005,18 +1961,21 @@ int main(int argc, const char **argv_) { * --passes=2, ensure --fpf was set. */ if (global.pass && global.passes == 2) - FOREACH_STREAM( { - if (!stream->config.stats_fn) - die("Stream %d: Must specify --fpf when --pass=%d" - " and --passes=2\n", stream->index, global.pass); - }); + FOREACH_STREAM({ + if (!stream->config.stats_fn) + die( + "Stream %d: Must specify --fpf when --pass=%d" + " and --passes=2\n", + stream->index, global.pass); + }); #if !CONFIG_WEBM_IO FOREACH_STREAM({ if (stream->config.write_webm) { stream->config.write_webm = 0; - warn("vpxenc was compiled without WebM container support." - "Producing IVF output"); + warn( + "vpxenc was compiled without WebM container support." + "Producing IVF output"); } }); #endif @@ -2044,14 +2003,13 @@ int main(int argc, const char **argv_) { else vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32); - FOREACH_STREAM(stream->rate_hist = - init_rate_histogram(&stream->config.cfg, - &global.framerate)); + FOREACH_STREAM(stream->rate_hist = init_rate_histogram( + &stream->config.cfg, &global.framerate)); } FOREACH_STREAM(setup_pass(stream, &global, pass)); - FOREACH_STREAM(open_output_file(stream, &global, - &input.pixel_aspect_ratio)); + FOREACH_STREAM( + open_output_file(stream, &global, &input.pixel_aspect_ratio)); FOREACH_STREAM(initialize_encoder(stream, &global)); #if CONFIG_VP9_HIGHBITDEPTH @@ -2068,7 +2026,7 @@ int main(int argc, const char **argv_) { input_shift = 0; } else { input_shift = (int)stream->config.cfg.g_bit_depth - - stream->config.cfg.g_input_bit_depth; + stream->config.cfg.g_input_bit_depth; } }); } @@ -2083,26 +2041,23 @@ int main(int argc, const char **argv_) { if (!global.limit || frames_in < global.limit) { frame_avail = read_frame(&input, &raw); - if (frame_avail) - frames_in++; - seen_frames = frames_in > global.skip_frames ? - frames_in - global.skip_frames : 0; + if (frame_avail) frames_in++; + seen_frames = + frames_in > global.skip_frames ? 
frames_in - global.skip_frames : 0; if (!global.quiet) { float fps = usec_to_fps(cx_time, seen_frames); fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes); if (stream_cnt == 1) - fprintf(stderr, - "frame %4d/%-4d %7"PRId64"B ", - frames_in, streams->frames_out, (int64_t)streams->nbytes); + fprintf(stderr, "frame %4d/%-4d %7" PRId64 "B ", frames_in, + streams->frames_out, (int64_t)streams->nbytes); else fprintf(stderr, "frame %4d ", frames_in); - fprintf(stderr, "%7"PRId64" %s %.2f %s ", + fprintf(stderr, "%7" PRId64 " %s %.2f %s ", cx_time > 9999999 ? cx_time / 1000 : cx_time, - cx_time > 9999999 ? "ms" : "us", - fps >= 1.0 ? fps : fps * 60, + cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60, fps >= 1.0 ? "fps" : "fpm"); print_time("ETA", estimated_time_left); } @@ -2133,8 +2088,7 @@ int main(int argc, const char **argv_) { FOREACH_STREAM({ if (stream->config.use_16bit_internal) encode_frame(stream, &global, - frame_avail ? frame_to_encode : NULL, - frames_in); + frame_avail ? frame_to_encode : NULL, frames_in); else assert(0); }); @@ -2146,8 +2100,7 @@ int main(int argc, const char **argv_) { } #else vpx_usec_timer_start(&timer); - FOREACH_STREAM(encode_frame(stream, &global, - frame_avail ? &raw : NULL, + FOREACH_STREAM(encode_frame(stream, &global, frame_avail ? &raw : NULL, frames_in)); #endif vpx_usec_timer_mark(&timer); @@ -2169,8 +2122,8 @@ int main(int argc, const char **argv_) { const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000; rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0; - remaining = 1000 * (global.limit - global.skip_frames - - seen_frames + lagged_count); + remaining = 1000 * (global.limit - global.skip_frames - + seen_frames + lagged_count); } else { const int64_t input_pos = ftello(input.file); const int64_t input_pos_lagged = input_pos - lagged_count; @@ -2180,9 +2133,8 @@ int main(int argc, const char **argv_) { remaining = limit - input_pos + lagged_count; } - average_rate = (average_rate <= 0) - ? rate - : (average_rate * 7 + rate) / 8; + average_rate = + (average_rate <= 0) ? rate : (average_rate * 7 + rate) / 8; estimated_time_left = average_rate ? remaining / average_rate : -1; } @@ -2191,23 +2143,23 @@ int main(int argc, const char **argv_) { } fflush(stdout); - if (!global.quiet) - fprintf(stderr, "\033[K"); + if (!global.quiet) fprintf(stderr, "\033[K"); } - if (stream_cnt > 1) - fprintf(stderr, "\n"); + if (stream_cnt > 1) fprintf(stderr, "\n"); if (!global.quiet) { - FOREACH_STREAM(fprintf(stderr, - "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7"PRId64"b/f %7"PRId64"b/s" - " %7"PRId64" %s (%.2f fps)\033[K\n", - pass + 1, - global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes, + FOREACH_STREAM(fprintf( + stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 + "b/f %7" PRId64 "b/s" + " %7" PRId64 " %s (%.2f fps)\033[K\n", + pass + 1, global.passes, frames_in, stream->frames_out, + (int64_t)stream->nbytes, seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0, - seen_frames ? (int64_t)stream->nbytes * 8 * - (int64_t)global.framerate.num / global.framerate.den / - seen_frames : 0, + seen_frames + ? (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num / + global.framerate.den / seen_frames + : 0, stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time, stream->cx_time > 9999999 ? 
"ms" : "us", usec_to_fps(stream->cx_time, seen_frames))); @@ -2242,17 +2194,15 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(stats_close(&stream->fpmb_stats, global.passes - 1)); #endif - if (global.pass) - break; + if (global.pass) break; } if (global.show_q_hist_buckets) - FOREACH_STREAM(show_q_histogram(stream->counts, - global.show_q_hist_buckets)); + FOREACH_STREAM( + show_q_histogram(stream->counts, global.show_q_hist_buckets)); if (global.show_rate_hist_buckets) - FOREACH_STREAM(show_rate_histogram(stream->rate_hist, - &stream->config.cfg, + FOREACH_STREAM(show_rate_histogram(stream->rate_hist, &stream->config.cfg, global.show_rate_hist_buckets)); FOREACH_STREAM(destroy_rate_histogram(stream->rate_hist)); @@ -2274,8 +2224,7 @@ int main(int argc, const char **argv_) { #endif #if CONFIG_VP9_HIGHBITDEPTH - if (allocated_raw_shift) - vpx_img_free(&raw_shift); + if (allocated_raw_shift) vpx_img_free(&raw_shift); #endif vpx_img_free(&raw); free(argv); diff --git a/vpxstats.c b/vpxstats.c index 16728ce09637d33614457ae20b847133ca3ab8db..142e367bb48c440fc7873722cdff8cbfdaaca8d5 100644 --- a/vpxstats.c +++ b/vpxstats.c @@ -30,8 +30,7 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) { stats->file = fopen(fpf, "rb"); - if (stats->file == NULL) - fatal("First-pass stats file does not exist!"); + if (stats->file == NULL) fatal("First-pass stats file does not exist!"); if (fseek(stats->file, 0, SEEK_END)) fatal("First-pass stats file must be seekable!"); @@ -76,18 +75,17 @@ void stats_close(stats_io_t *stats, int last_pass) { fclose(stats->file); stats->file = NULL; } else { - if (stats->pass == last_pass) - free(stats->buf.buf); + if (stats->pass == last_pass) free(stats->buf.buf); } } void stats_write(stats_io_t *stats, const void *pkt, size_t len) { if (stats->file) { - (void) fwrite(pkt, 1, len, stats->file); + (void)fwrite(pkt, 1, len, stats->file); } else { if (stats->buf.sz + len > stats->buf_alloc_sz) { - size_t new_sz = stats->buf_alloc_sz + 64 * 1024; - char *new_ptr = realloc(stats->buf.buf, new_sz); + size_t new_sz = stats->buf_alloc_sz + 64 * 1024; + char *new_ptr = realloc(stats->buf.buf, new_sz); if (new_ptr) { stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf); @@ -104,6 +102,4 @@ void stats_write(stats_io_t *stats, const void *pkt, size_t len) { } } -vpx_fixed_buf_t stats_get(stats_io_t *stats) { - return stats->buf; -} +vpx_fixed_buf_t stats_get(stats_io_t *stats) { return stats->buf; } diff --git a/warnings.c b/warnings.c index 7ac678ab4aabfee4a94ce7916f0580f1205c5b25..a3e4926674b00eb434a478835dbaac2219034766 100644 --- a/warnings.c +++ b/warnings.c @@ -47,8 +47,7 @@ static void add_warning(const char *warning_string, new_node->warning_string = warning_string; new_node->next_warning = NULL; - while (*node != NULL) - node = &(*node)->next_warning; + while (*node != NULL) node = &(*node)->next_warning; *node = new_node; } @@ -78,9 +77,7 @@ static void check_quantizer(int min_q, int max_q, } static void check_lag_in_frames_realtime_deadline( - int lag_in_frames, - int deadline, - struct WarningList *warning_list) { + int lag_in_frames, int deadline, struct WarningList *warning_list) { if (deadline == VPX_DL_REALTIME && lag_in_frames != 0) add_warning(lag_in_frames_with_realtime, warning_list); } @@ -90,26 +87,21 @@ void check_encoder_config(int disable_prompt, const struct vpx_codec_enc_cfg *stream_config) { int num_warnings = 0; struct WarningListNode *warning = NULL; - struct WarningList warning_list = {0}; + struct WarningList 
warning_list = { 0 }; check_quantizer(stream_config->rc_min_quantizer, - stream_config->rc_max_quantizer, - &warning_list); + stream_config->rc_max_quantizer, &warning_list); check_lag_in_frames_realtime_deadline(stream_config->g_lag_in_frames, - global_config->deadline, - &warning_list); + global_config->deadline, &warning_list); /* Count and print warnings. */ - for (warning = warning_list.warning_node; - warning != NULL; - warning = warning->next_warning, - ++num_warnings) { + for (warning = warning_list.warning_node; warning != NULL; + warning = warning->next_warning, ++num_warnings) { warn(warning->warning_string); } free_warning_list(&warning_list); if (num_warnings) { - if (!disable_prompt && !continue_prompt(num_warnings)) - exit(EXIT_FAILURE); + if (!disable_prompt && !continue_prompt(num_warnings)) exit(EXIT_FAILURE); } } diff --git a/webmdec.h b/webmdec.h index aa371f32122d88099186979a89f9139cea5d9de9..7dcb170caf3c0f1d394bb5e24764e619deb5edc6 100644 --- a/webmdec.h +++ b/webmdec.h @@ -52,8 +52,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx, // 0 - Success // 1 - End of Stream // -1 - Error -int webm_read_frame(struct WebmInputContext *webm_ctx, - uint8_t **buffer, +int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, size_t *buffer_size); // Guesses the frame rate of the input file based on the container timestamps. diff --git a/webmenc.h b/webmenc.h index ad30664e31a8b6228102e2ad132211e2a203370d..1ae7786cd773ca9cce942fc1a69635e1fce85607 100644 --- a/webmenc.h +++ b/webmenc.h @@ -40,8 +40,7 @@ typedef enum stereo_format { void write_webm_file_header(struct WebmOutputContext *webm_ctx, const vpx_codec_enc_cfg_t *cfg, const struct vpx_rational *fps, - stereo_format_t stereo_fmt, - unsigned int fourcc, + stereo_format_t stereo_fmt, unsigned int fourcc, const struct VpxRational *par); void write_webm_block(struct WebmOutputContext *webm_ctx, diff --git a/y4menc.c b/y4menc.c index b647e8dcc5e891a3e57ea637be4dd40102e081c4..e26fcaf6ea30885dbdc01a5c9902817181473d95 100644 --- a/y4menc.c +++ b/y4menc.c @@ -17,39 +17,43 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height, const char *color; switch (bit_depth) { case 8: - color = fmt == VPX_IMG_FMT_444A ? "C444alpha\n" : - fmt == VPX_IMG_FMT_I444 ? "C444\n" : - fmt == VPX_IMG_FMT_I422 ? "C422\n" : - "C420jpeg\n"; + color = fmt == VPX_IMG_FMT_444A + ? "C444alpha\n" + : fmt == VPX_IMG_FMT_I444 ? "C444\n" : fmt == VPX_IMG_FMT_I422 + ? "C422\n" + : "C420jpeg\n"; break; case 9: - color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" : - "C420p9 XYSCSS=420P9\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p9 XYSCSS=444P9\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" + : "C420p9 XYSCSS=420P9\n"; break; case 10: - color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" : - "C420p10 XYSCSS=420P10\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p10 XYSCSS=444P10\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" + : "C420p10 XYSCSS=420P10\n"; break; case 12: - color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" : - "C420p12 XYSCSS=420P12\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p12 XYSCSS=444P12\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" + : "C420p12 XYSCSS=420P12\n"; break; case 14: - color = fmt == VPX_IMG_FMT_I44416 ? 
"C444p14 XYSCSS=444P14\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" : - "C420p14 XYSCSS=420P14\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p14 XYSCSS=444P14\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" + : "C420p14 XYSCSS=420P14\n"; break; case 16: - color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" : - "C420p16 XYSCSS=420P16\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p16 XYSCSS=444P16\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" + : "C420p16 XYSCSS=420P16\n"; break; - default: - color = NULL; - assert(0); + default: color = NULL; assert(0); } return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height, framerate->numerator, framerate->denominator, 'p', color); diff --git a/y4minput.c b/y4minput.c index 2dbd603e8c1ae8c0178cb9ab6499f92c1884c7d7..7de859f7ac1cc09fdfffbba66e334b734e545ec2 100644 --- a/y4minput.c +++ b/y4minput.c @@ -25,7 +25,7 @@ static int file_read(void *buf, size_t size, FILE *file) { int file_error; size_t len = 0; do { - const size_t n = fread((uint8_t*)buf + len, 1, size - len, file); + const size_t n = fread((uint8_t *)buf + len, 1, size - len, file); len += n; file_error = ferror(file); if (file_error) { @@ -41,21 +41,22 @@ static int file_read(void *buf, size_t size, FILE *file) { } while (!feof(file) && len < size && ++retry_count < kMaxRetries); if (!feof(file) && len != size) { - fprintf(stderr, "Error reading file: %u of %u bytes read," - " error: %d, retries: %d, %d: %s\n", - (uint32_t)len, (uint32_t)size, file_error, retry_count, - errno, strerror(errno)); + fprintf(stderr, + "Error reading file: %u of %u bytes read," + " error: %d, retries: %d, %d: %s\n", + (uint32_t)len, (uint32_t)size, file_error, retry_count, errno, + strerror(errno)); } return len == size; } static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { - int got_w; - int got_h; - int got_fps; - int got_interlace; - int got_par; - int got_chroma; + int got_w; + int got_h; + int got_fps; + int got_interlace; + int got_par; + int got_chroma; char *p; char *q; got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0; @@ -70,55 +71,47 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { /*Process the tag.*/ switch (p[0]) { case 'W': { - if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1)return -1; + if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1; got_w = 1; - } - break; + } break; case 'H': { - if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1)return -1; + if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1; got_h = 1; - } - break; + } break; case 'F': { if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) { return -1; } got_fps = 1; - } - break; + } break; case 'I': { _y4m->interlace = p[1]; got_interlace = 1; - } - break; + } break; case 'A': { if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) { return -1; } got_par = 1; - } - break; + } break; case 'C': { - if (q - p > 16)return -1; + if (q - p > 16) return -1; memcpy(_y4m->chroma_type, p + 1, q - p - 1); _y4m->chroma_type[q - p - 1] = '\0'; got_chroma = 1; - } - break; - /*Ignore unknown tags.*/ + } break; + /*Ignore unknown tags.*/ } } - if (!got_w || !got_h || !got_fps)return -1; - if (!got_interlace)_y4m->interlace = '?'; - if (!got_par)_y4m->par_n = _y4m->par_d = 0; + if (!got_w || !got_h || !got_fps) return -1; + if (!got_interlace) _y4m->interlace = '?'; + if (!got_par) _y4m->par_n = _y4m->par_d = 0; /*Chroma-type is not specified in older files, e.g., those generated 
by mplayer.*/ - if (!got_chroma)strcpy(_y4m->chroma_type, "420"); + if (!got_chroma) strcpy(_y4m->chroma_type, "420"); return 0; } - - /*All anti-aliasing filters in the following conversion functions are based on one of two window functions: The 6-tap Lanczos window (for down-sampling and shifts): @@ -141,9 +134,9 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { have these steps pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ -#define OC_MINI(_a,_b) ((_a)>(_b)?(_b):(_a)) -#define OC_MAXI(_a,_b) ((_a)<(_b)?(_b):(_a)) -#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c))) +#define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) +#define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) +#define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c))) /*420jpeg chroma samples are sited like: Y-------Y-------Y-------Y------- @@ -187,25 +180,36 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { lines, and they are vertically co-sited with the luma samples in both the mpeg2 and jpeg cases (thus requiring no vertical resampling).*/ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, - const unsigned char *_src, int _c_w, int _c_h) { + const unsigned char *_src, int _c_w, + int _c_h) { int y; int x; for (y = 0; y < _c_h; y++) { /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos window.*/ for (x = 0; x < OC_MINI(_c_w, 2); x++) { - _dst[x] = (unsigned char)OC_CLAMPI(0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + - 114 * _src[x] + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + - _src[OC_MINI(x + 3, _c_w - 1)] + 64) >> 7, 255); + _dst[x] = (unsigned char)OC_CLAMPI( + 0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + + _src[OC_MINI(x + 3, _c_w - 1)] + 64) >> + 7, + 255); } for (; x < _c_w - 3; x++) { - _dst[x] = (unsigned char)OC_CLAMPI(0, (4 * _src[x - 2] - 17 * _src[x - 1] + - 114 * _src[x] + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> 7, 255); + _dst[x] = (unsigned char)OC_CLAMPI( + 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> + 7, + 255); } for (; x < _c_w; x++) { - _dst[x] = (unsigned char)OC_CLAMPI(0, (4 * _src[x - 2] - 17 * _src[x - 1] + - 114 * _src[x] + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + - _src[_c_w - 1] + 64) >> 7, 255); + _dst[x] = (unsigned char)OC_CLAMPI( + 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> + 7, + 255); } _dst += _c_w; _src += _c_w; @@ -278,12 +282,12 @@ static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst, static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; - int c_w; - int c_h; - int c_sz; - int pli; - int y; - int x; + int c_w; + int c_h; + int c_sz; + int pli; + int y; + int x; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ @@ -303,53 +307,73 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, This is the same filter used above, but in the other order.*/ for (x = 0; x < c_w; x++) { for (y = 0; y < OC_MINI(c_h, 3); y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (tmp[0] - - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + 35 * tmp[OC_MAXI(y - 1, 0) 
* c_w] - + 114 * tmp[y * c_w] - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> + 7, + 255); } for (; y < c_h - 2; y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (tmp[(y - 3) * c_w] - - 9 * tmp[(y - 2) * c_w] + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> + 7, + 255); } for (; y < c_h; y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (tmp[(y - 3) * c_w] - - 9 * tmp[(y - 2) * c_w] + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + 4 * tmp[(c_h - 1) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[(c_h - 1) * c_w] + 64) >> + 7, + 255); } _dst++; tmp++; } _dst += c_sz - c_w; tmp -= c_w; - } - break; + } break; case 2: { /*Slide C_r down a quarter-pel. This is the same as the horizontal filter.*/ for (x = 0; x < c_w; x++) { for (y = 0; y < OC_MINI(c_h, 2); y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (4 * tmp[0] - - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] - + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[0] - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> + 7, + 255); } for (; y < c_h - 3; y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (4 * tmp[(y - 2) * c_w] - - 17 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - - 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - + 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> + 7, + 255); } for (; y < c_h; y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (4 * tmp[(y - 2) * c_w] - - 17 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - - 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + + 64) >> + 7, + 255); } _dst++; tmp++; } - } - break; + } break; } /*For actual interlaced material, this would have to be done separately on each field, and the shift amounts would be different. @@ -364,27 +388,37 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, /*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0. 
This is used as a helper by several converation routines.*/ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, - const unsigned char *_src, int _c_w, int _c_h) { + const unsigned char *_src, int _c_w, + int _c_h) { int y; int x; /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ for (x = 0; x < _c_w; x++) { for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { - _dst[(y >> 1)*_c_w] = OC_CLAMPI(0, (64 * _src[0] - + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - - 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] - + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> 7, 255); + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - + 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> + 7, + 255); } for (; y < _c_h - 3; y += 2) { - _dst[(y >> 1)*_c_w] = OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - - 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) - + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> 7, 255); + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> + 7, + 255); } for (; y < _c_h; y += 2) { - _dst[(y >> 1)*_c_w] = OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] - + _src[(_c_h - 1) * _c_w]) - 17 * (_src[(y - 1) * _c_w] - + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) - + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + 64) >> 7, 255); + _dst[(y >> 1) * _c_w] = OC_CLAMPI( + 0, + (3 * (_src[(y - 2) * _c_w] + _src[(_c_h - 1) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) + + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + + 64) >> + 7, + 255); } _src++; _dst++; @@ -497,12 +531,12 @@ static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst, static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; - int c_w; - int c_h; - int c_sz; - int dst_c_h; - int dst_c_sz; - int pli; + int c_w; + int c_h; + int c_sz; + int dst_c_h; + int dst_c_sz; + int pli; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ @@ -569,16 +603,16 @@ static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst, static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; - int c_w; - int c_h; - int c_sz; - int dst_c_w; - int dst_c_h; - int dst_c_sz; - int tmp_sz; - int pli; - int y; - int x; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ @@ -599,23 +633,42 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a 4-tap Mitchell window.*/ for (x = 0; x < OC_MINI(c_w, 1); x++) { - tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (111 * _aux[0] - + 18 * _aux[OC_MINI(1, c_w - 1)] - _aux[OC_MINI(2, c_w - 1)] + 64) >> 7, 255); - tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(0, (47 * _aux[0] - + 86 * _aux[OC_MINI(1, c_w - 1)] - 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> 7, 255); + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - + _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned 
char)OC_CLAMPI( + 0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - + 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); } for (; x < c_w - 2; x++) { - tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] - + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> 7, 255); - tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(0, (-3 * _aux[x - 1] + 50 * _aux[x] - + 86 * _aux[x + 1] - 5 * _aux[x + 2] + 64) >> 7, 255); + tmp[x << 1] = + (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - + 5 * _aux[x + 2] + 64) >> + 7, + 255); } for (; x < c_w; x++) { - tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] - + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >> 7, 255); + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >> + 7, + 255); if ((x << 1 | 1) < dst_c_w) { - tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(0, (-3 * _aux[x - 1] + 50 * _aux[x] - + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> 7, 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> + 7, + 255); } } tmp += dst_c_w; @@ -632,16 +685,16 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; - int c_w; - int c_h; - int c_sz; - int dst_c_w; - int dst_c_h; - int dst_c_sz; - int tmp_sz; - int pli; - int y; - int x; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ @@ -657,18 +710,27 @@ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ for (y = 0; y < c_h; y++) { for (x = 0; x < OC_MINI(c_w, 2); x += 2) { - tmp[x >> 1] = OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - - 17 * _aux[OC_MINI(2, c_w - 1)] - + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> 7, 255); + tmp[x >> 1] = + OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - + 17 * _aux[OC_MINI(2, c_w - 1)] + + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> + 7, + 255); } for (; x < c_w - 3; x += 2) { - tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) - - 17 * (_aux[x - 1] + _aux[x + 2]) + 78 * (_aux[x] + _aux[x + 1]) + 64) >> 7, 255); + tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) - + 17 * (_aux[x - 1] + _aux[x + 2]) + + 78 * (_aux[x] + _aux[x + 1]) + 64) >> + 7, + 255); } for (; x < c_w; x += 2) { - tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[c_w - 1]) - - 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + - 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> 7, 255); + tmp[x >> 1] = OC_CLAMPI( + 0, (3 * (_aux[x - 2] + _aux[c_w - 1]) - + 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> + 7, + 255); } tmp += dst_c_w; _aux += c_w; @@ -701,9 +763,9 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, int only_420) { - char buffer[80] = {0}; - int ret; - int i; + char buffer[80] = { 0 
}; + int ret; + int i; /*Read until newline, or 80 cols, whichever happens first.*/ for (i = 0; i < 79; i++) { if (_nskip > 0) { @@ -712,10 +774,10 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, } else { if (!file_read(buffer + i, 1, _fin)) return -1; } - if (buffer[i] == '\n')break; + if (buffer[i] == '\n') break; } /*We skipped too much header data.*/ - if (_nskip > 0)return -1; + if (_nskip > 0) return -1; if (i == 79) { fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n"); return -1; @@ -734,10 +796,12 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, return ret; } if (_y4m->interlace == '?') { - fprintf(stderr, "Warning: Input video interlacing format unknown; " + fprintf(stderr, + "Warning: Input video interlacing format unknown; " "assuming progressive scan.\n"); } else if (_y4m->interlace != 'p') { - fprintf(stderr, "Input video is interlaced; " + fprintf(stderr, + "Input video is interlaced; " "Only progressive scan handled.\n"); return -1; } @@ -746,9 +810,11 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->bit_depth = 8; if (strcmp(_y4m->chroma_type, "420") == 0 || strcmp(_y4m->chroma_type, "420jpeg") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h - + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = + _y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); /* Natively supported: no conversion required. */ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; _y4m->convert = y4m_convert_null; @@ -757,9 +823,9 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->dst_c_dec_h = 2; _y4m->src_c_dec_v = 2; _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * - ((_y4m->pic_h + 1) / 2)); + _y4m->dst_buf_read_sz = + 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); /* Natively supported: no conversion required. */ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; _y4m->convert = y4m_convert_null; @@ -775,9 +841,9 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->dst_c_dec_h = 2; _y4m->src_c_dec_v = 2; _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * - ((_y4m->pic_h + 1) / 2)); + _y4m->dst_buf_read_sz = + 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); /* Natively supported: no conversion required. 
*/ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; _y4m->convert = y4m_convert_null; @@ -789,20 +855,23 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, return -1; } } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2; + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; /*Chroma filter required: read into the aux buf first.*/ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); _y4m->convert = y4m_convert_42xmpeg2_42xjpeg; } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2; + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); _y4m->convert = y4m_convert_42xpaldv_42xjpeg; } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) { _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2; @@ -810,7 +879,8 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->dst_c_dec_v = 2; _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; /*Chroma filter required: read into the aux buf first.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; _y4m->convert = y4m_convert_422jpeg_420jpeg; } else if (strcmp(_y4m->chroma_type, "422") == 0) { _y4m->src_c_dec_h = 2; @@ -823,16 +893,16 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, We need to make two filter passes, so we need some extra space in the aux buffer.*/ _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + - ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; _y4m->convert = y4m_convert_422_420jpeg; } else { _y4m->vpx_fmt = VPX_IMG_FMT_I422; _y4m->bps = 16; _y4m->dst_c_dec_h = _y4m->src_c_dec_h; _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h - + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->dst_buf_read_sz = + _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; /*Natively supported: no conversion required.*/ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; _y4m->convert = y4m_convert_null; @@ -879,7 +949,8 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, We need to make two filter passes, so we need some extra space in the aux buffer.*/ _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; _y4m->convert = y4m_convert_411_420jpeg; } else if (strcmp(_y4m->chroma_type, "444") == 0) { _y4m->src_c_dec_h = 1; @@ -892,8 +963,8 @@ int 
y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, We need to make two filter passes, so we need some extra space in the aux buffer.*/ _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + - ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; _y4m->convert = y4m_convert_444_420jpeg; } else { _y4m->vpx_fmt = VPX_IMG_FMT_I444; @@ -972,9 +1043,10 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, } /*The size of the final frame buffers is always computed from the destination chroma decimation type.*/ - _y4m->dst_buf_sz = _y4m->pic_w * _y4m->pic_h - + 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * - ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); + _y4m->dst_buf_sz = + _y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * + ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); if (_y4m->bit_depth == 8) _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz); else @@ -992,11 +1064,11 @@ void y4m_input_close(y4m_input *_y4m) { int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { char frame[6]; - int pic_sz; - int c_w; - int c_h; - int c_sz; - int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1; + int pic_sz; + int c_w; + int c_h; + int c_sz; + int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1; /*Read and skip the frame header.*/ if (!file_read(frame, 6, _fin)) return 0; if (memcmp(frame, "FRAME", 5)) { @@ -1005,8 +1077,9 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { } if (frame[5] != '\n') { char c; - int j; - for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) {} + int j; + for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) { + } if (j == 79) { fprintf(stderr, "Error parsing Y4M frame header\n"); return -1; diff --git a/y4minput.h b/y4minput.h index 356cebbcf0aadabe4b86f5e7f645251aa1fa7479..9e69ceb835a8861620e7b822d415225c9538b3c9 100644 --- a/y4minput.h +++ b/y4minput.h @@ -14,52 +14,46 @@ #ifndef Y4MINPUT_H_ #define Y4MINPUT_H_ -# include <stdio.h> -# include "vpx/vpx_image.h" +#include <stdio.h> +#include "vpx/vpx_image.h" #ifdef __cplusplus extern "C" { #endif - - typedef struct y4m_input y4m_input; - - /*The function used to perform chroma conversion.*/ -typedef void (*y4m_convert_func)(y4m_input *_y4m, - unsigned char *_dst, unsigned char *_src); - - +typedef void (*y4m_convert_func)(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_src); struct y4m_input { - int pic_w; - int pic_h; - int fps_n; - int fps_d; - int par_n; - int par_d; - char interlace; - int src_c_dec_h; - int src_c_dec_v; - int dst_c_dec_h; - int dst_c_dec_v; - char chroma_type[16]; + int pic_w; + int pic_h; + int fps_n; + int fps_d; + int par_n; + int par_d; + char interlace; + int src_c_dec_h; + int src_c_dec_v; + int dst_c_dec_h; + int dst_c_dec_v; + char chroma_type[16]; /*The size of each converted frame buffer.*/ - size_t dst_buf_sz; + size_t dst_buf_sz; /*The amount to read directly into the converted frame buffer.*/ - size_t dst_buf_read_sz; + size_t dst_buf_read_sz; /*The size of the auxilliary buffer.*/ - size_t aux_buf_sz; + size_t aux_buf_sz; /*The amount to read into the auxilliary buffer.*/ - size_t aux_buf_read_sz; - y4m_convert_func convert; - unsigned char *dst_buf; - unsigned char *aux_buf; - enum vpx_img_fmt vpx_fmt; - int bps; - unsigned int bit_depth; + size_t 
aux_buf_read_sz; + y4m_convert_func convert; + unsigned char *dst_buf; + unsigned char *aux_buf; + enum vpx_img_fmt vpx_fmt; + int bps; + unsigned int bit_depth; }; int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,