diff --git a/benches/predict.rs b/benches/predict.rs
index 8a25a2b74f0f9914221a668616cfb9e71e896f65..a15c3c2df93f16f20ad7f249779766cdfff25950 100644
--- a/benches/predict.rs
+++ b/benches/predict.rs
@@ -25,8 +25,19 @@ pub fn generate_block(rng: &mut ChaChaRng) -> (Vec<u16>, Vec<u16>, Vec<u16>) {
   (block, above_context, left_context)
 }
 
+pub fn generate_block_u8(rng: &mut ChaChaRng) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
+  let block = vec![0u8; BLOCK_SIZE.width() * BLOCK_SIZE.height()]; // zeroed output
+  let above_context: Vec<u8> = // drawn from rng; NOTE(review): sized by height()
+    (0..BLOCK_SIZE.height()).map(|_| rng.gen()).collect();
+  let left_context: Vec<u8> = // drawn from rng; NOTE(review): sized by width()
+    (0..BLOCK_SIZE.width()).map(|_| rng.gen()).collect();
+  // u8 bench inputs, returned as (block, above_context, left_context).
+  (block, above_context, left_context)
+}
+
 pub fn pred_bench(c: &mut Criterion) {
   c.bench_function("intra_dc_4x4", |b| intra_dc_4x4(b));
+  c.bench_function("intra_dc_128_4x4_u8", |b| intra_dc_128_4x4_u8(b));
   c.bench_function("intra_dc_left_4x4", |b| intra_dc_left_4x4(b));
   c.bench_function("intra_dc_top_4x4", |b| intra_dc_top_4x4(b));
   c.bench_function("intra_h_4x4", |b| intra_h_4x4(b));
@@ -54,6 +65,21 @@ pub fn intra_dc_4x4(b: &mut Bencher) {
   })
 }
 
+pub fn intra_dc_128_4x4_u8(b: &mut Bencher) { // bench: 8-bit pred_dc_128 on Block4x4
+  let mut ra = ChaChaRng::from_seed([0; 32]); // fixed seed => reproducible inputs
+  let (mut block, _above, _left) = generate_block_u8(&mut ra); // edges unused by DC_128
+
+  b.iter(|| {
+    for _ in 0..MAX_ITER {
+      Block4x4::pred_dc_128(
+        &mut block,
+        BLOCK_SIZE.width(), // stride: block is allocated as width * height
+        8 // bit depth
+      );
+    }
+  })
+}
+
 pub fn intra_dc_left_4x4(b: &mut Bencher) {
   let mut ra = ChaChaRng::from_seed([0; 32]);
   let (mut block, above, left) = generate_block(&mut ra);
diff --git a/build.rs b/build.rs
index 2928f3ac0c1710e947790b6660df3ced4496a653..c9f92f88bcead12ca2011bd085a358c27a5503b1 100644
--- a/build.rs
+++ b/build.rs
@@ -14,22 +14,27 @@ use std::fs;
 use std::path::Path;
 
 fn main() {
-    #[cfg(target_arch = "x86_64")] {
+    #[cfg(all(target_arch = "x86_64", not(windows)))] {
         use std::fs::File;
         use std::io::Write;
         let out_dir = env::var("OUT_DIR").unwrap();
         {
             let dest_path = Path::new(&out_dir).join("config.asm");
             let mut config_file = File::create(dest_path).unwrap();
+            config_file.write(b"	%define private_prefix rav1e\n").unwrap();
             config_file.write(b"	%define ARCH_X86_32 0\n").unwrap();
             config_file.write(b" %define ARCH_X86_64 1\n").unwrap();
             config_file.write(b"	%define PIC 1\n").unwrap();
             config_file.write(b" %define STACK_ALIGNMENT 32\n").unwrap();
+            if cfg!(target_os="macos") {
+              config_file.write(b" %define PREFIX 1\n").unwrap();
+            }
         }
         let mut config_include_arg = String::from("-I");
         config_include_arg.push_str(&out_dir);
         config_include_arg.push('/');
-        nasm_rs::compile_library_args("rav1easm", &["src/x86/mc.asm"], &[&config_include_arg, "-Isrc/"]);
+        nasm_rs::compile_library_args("rav1easm", &["src/x86/ipred.asm"], &[&config_include_arg, "-Isrc/"]);
+        println!("cargo:rustc-link-lib=static=rav1easm");
     }
 
     if cfg!(windows) && cfg!(feature = "decode_test") {
diff --git a/src/predict.rs b/src/predict.rs
index f9971e49fc140482e4cbd7e561e5855d930c4b68..65b8f747eb84db77af8967c250909e526399774d 100644
--- a/src/predict.rs
+++ b/src/predict.rs
@@ -11,7 +11,6 @@
 #![cfg_attr(feature = "cargo-clippy", allow(cast_lossless))]
 #![cfg_attr(feature = "cargo-clippy", allow(needless_range_loop))]
 
-#[cfg(test)]
 use libc;
 use num_traits::*;
 
@@ -217,6 +216,14 @@ fn get_scaled_luma_q0(alpha_q3: i16, ac_pred_q3: i16) -> i32 {
   }
 }
 
+#[cfg(all(target_arch = "x86_64", not(windows)))] // same gate as the nasm build step in build.rs
+extern { // presumably defined in src/x86/ipred.asm (linked as `rav1easm`) — confirm
+  fn rav1e_ipred_dc_128_avx2(
+    dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8,
+    width: libc::c_int, height: libc::c_int, angle: libc::c_int
+  );
+}
+
 // TODO: rename the type bounds later
 pub trait Intra<T>: Dim
 where
@@ -241,6 +248,22 @@ where
 
   #[cfg_attr(feature = "comparative_bench", inline(never))]
   fn pred_dc_128(output: &mut [T], stride: usize, bit_depth: usize) {
+    #[cfg(all(target_arch = "x86_64", not(windows)))]
+    {
+      use std::ptr;
+      if size_of::<T>() == 1 && is_x86_feature_detected!("avx2") {
+        return unsafe {
+          rav1e_ipred_dc_128_avx2(
+            output.as_mut_ptr() as *mut _,
+            stride as libc::ptrdiff_t,
+            ptr::null(),
+            Self::W as libc::c_int,
+            Self::H as libc::c_int,
+            0
+          )
+        };
+      }
+    }
     for y in 0..Self::H {
       for x in 0..Self::W {
         output[y * stride + x] = (128u32 << (bit_depth - 8)).as_();
@@ -874,6 +897,18 @@ pub mod test {
     }
   }
 
+  #[test]
+  fn pred_matches_u8() { // 8-bit Block4x4::pred_dc_128 must fill the block with 128
+    let row128 = [128u8; 32]; // expected value for every predicted pixel
+    let mut o = vec![0u8; 32 * 32]; // zeroed buffer, used with stride 32
+
+    Block4x4::pred_dc_128(&mut o, 32, 8);
+
+    for l in o.chunks(32).take(4) { // first 4 rows of the buffer
+      assert_eq!(l[..4], row128[..4]); // only the first 4 columns are compared
+    }
+  }
+
   #[test]
   fn pred_same() {
     let mut ra = ChaChaRng::from_seed([0; 32]);