diff --git a/rtiow/renderer/Cargo.toml b/rtiow/renderer/Cargo.toml
index a9a0da2..b245290 100644
--- a/rtiow/renderer/Cargo.toml
+++ b/rtiow/renderer/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 [[bench]]
 harness = false
 name = "spheres"
+[[bench]]
+harness = false
+name = "aabb"
 
 [dependencies]
 chrono = "*"
diff --git a/rtiow/renderer/benches/aabb.rs b/rtiow/renderer/benches/aabb.rs
index cc4560a..9a8aa0a 100644
--- a/rtiow/renderer/benches/aabb.rs
+++ b/rtiow/renderer/benches/aabb.rs
@@ -1,16 +1,37 @@
 use criterion::*;
-
-fn decode(bytes: &[u8]) {
-    // Decode the bytes
-    //...
-}
+use renderer::{aabb::AABB, ray::Ray};
 
+/// Times the `AABB` hit routines against one ray that hits and one that misses.
 fn bench(c: &mut Criterion) {
-    let bytes: &[u8] = b"some bytes";
+    // A box ahead of the origin on +x; the first ray points at it, the second away.
+    let bb = AABB::new([1., -1., -1.], [3., 1., 1.]);
+    let rays = [
+        ("r_hit", Ray::new([0., 0., 0.], [1., 0., 0.], 0.)),
+        ("r_miss", Ray::new([0., 0., 0.], [-1., 0., 0.], 0.)),
+    ];
+    let t_min = 0.001;
+    let t_max = f32::MAX;
+
+    let mut group = c.benchmark_group("aabb");
+    group.throughput(Throughput::Elements(1));
+    // Register the same routines, in the same order, for each labelled ray.
+    for (label, ray) in rays {
+        group.bench_with_input(BenchmarkId::new("hit_naive", label), &ray, |b, r| {
+            b.iter(|| bb.hit_naive(*r, t_min, t_max))
+        });
+        group.bench_with_input(BenchmarkId::new("hit2", label), &ray, |b, r| {
+            b.iter(|| bb.hit2(*r, t_min, t_max))
+        });
+        // (`hit_precompute` is not benchmarked.)
+        group.bench_with_input(BenchmarkId::new("hit_fast", label), &ray, |b, r| {
+            b.iter(|| bb.hit_fast(*r, t_min, t_max))
+        });
+        #[cfg(target_arch = "x86_64")]
+        group.bench_with_input(BenchmarkId::new("hit_simd", label), &ray, |b, r| {
+            b.iter(|| bb.hit_simd(*r, t_min, t_max))
+        });
+    }
 
-    let mut group = c.benchmark_group("throughput-example");
-    group.throughput(Throughput::Bytes(bytes.len() as u64));
-    group.bench_function("decode", |b| b.iter(|| decode(bytes)));
     group.finish();
 }
 
diff --git a/rtiow/renderer/benches/spheres.rs b/rtiow/renderer/benches/spheres.rs
index b8b93c3..1b26e47 100644
--- a/rtiow/renderer/benches/spheres.rs
+++ b/rtiow/renderer/benches/spheres.rs
@@ -1,6 +1,3 @@
-#[macro_use]
-extern crate criterion;
-
 use criterion::*;
 
 use renderer::{
@@ -21,13 +18,13 @@ fn criterion_benchmark(c: &mut Criterion) {
         Ray::new([0., 0., -2.], [0., 0., -1.], 0.),
     ];
     let mut group = c.benchmark_group("sphere");
-    for r in rays {
-        group.bench_with_input(
-            BenchmarkId::new("Sphere", format!("{:?}", r)),
-            &r,
-            |b, r| b.iter(|| sphere.hit(*r, 0., 1.)),
-        );
-    }
+    group.throughput(Throughput::Elements(1));
+    group.bench_with_input(BenchmarkId::new("Sphere", "hit"), &rays[0], |b, r| {
+        b.iter(|| sphere.hit(*r, 0., 1.))
+    });
+    group.bench_with_input(BenchmarkId::new("Sphere", "miss"), &rays[1], |b, r| {
+        b.iter(|| sphere.hit(*r, 0., 1.))
+    });
     group.finish()
 }
 
diff --git a/rtiow/renderer/src/aabb.rs b/rtiow/renderer/src/aabb.rs
index 95bf2a1..0aded58 100644
--- a/rtiow/renderer/src/aabb.rs
+++ b/rtiow/renderer/src/aabb.rs
@@ -2,7 +2,7 @@ use std::fmt;
 
 use crate::{ray::Ray, vec3::Vec3};
 
-#[derive(Debug, Copy, Clone, PartialEq)]
+#[derive(Default, Debug, Copy, Clone, PartialEq)]
 pub struct AABB {
     bounds: [Vec3; 2],
 }
@@ -30,7 +30,12 @@ fn max(x: f32, y: f32) -> f32 {
 }
 
 impl AABB {
-    pub fn new(min: Vec3, max: Vec3) -> AABB {
+    /// Builds a box from its two corners; panics unless `min < max` on every axis.
+    pub fn new<V: Into<Vec3>>(min: V, max: V) -> AABB {
+        let (min, max): (Vec3, Vec3) = (min.into(), max.into());
+        assert!(min.x < max.x);
+        assert!(min.y < max.y);
+        assert!(min.z < max.z);
         AABB { bounds: [min, max] }
     }
 
@@ -61,10 +66,33 @@ impl AABB {
     pub fn min(&self) -> Vec3 {
         self.bounds[0]
     }
+
     pub fn max(&self) -> Vec3 {
         self.bounds[1]
     }
 
+    pub fn hit(&self, r: Ray, t_min: f32, t_max: f32) -> bool {
+        self.hit_simd(r, t_min, t_max)
+        // Scalar alternative, kept for A/B comparison: self.hit_naive(r, t_min, t_max)
+    }
+
+    /// Scalar slab test: clips `[t_min, t_max]` against the box one axis at a time.
+    pub fn hit_naive(&self, r: Ray, t_min: f32, t_max: f32) -> bool {
+        let (lo, hi) = (self.min(), self.max());
+        let (mut near, mut far) = (t_min, t_max);
+        for axis in 0..3 {
+            let ta = (lo[axis] - r.origin[axis]) * r.inv_direction[axis];
+            let tb = (hi[axis] - r.origin[axis]) * r.inv_direction[axis];
+            near = ta.min(tb).max(near);
+            far = ta.max(tb).min(far);
+            // Once the clipped interval is empty the ray cannot be inside the box.
+            if far <= near {
+                return false;
+            }
+        }
+        true
+    }
+
     pub fn hit2(&self, r: Ray, t_min: f32, t_max: f32) -> bool {
         let mut t_min = t_min;
         let mut t_max = t_max;
@@ -119,21 +147,37 @@ impl AABB {
         t_min < t1 && t_max > t0
     }
 
-    pub fn hit(&self, r: Ray, t_min: f32, t_max: f32) -> bool {
-        let mut t_min = t_min;
-        let mut t_max = t_max;
-        for axis in 0..3 {
-            let t0 = ((self.min()[axis] - r.origin[axis]) * r.inv_direction[axis])
-                .min((self.max()[axis] - r.origin[axis]) * r.inv_direction[axis]);
-            let t1 = ((self.min()[axis] - r.origin[axis]) * r.inv_direction[axis])
-                .max((self.max()[axis] - r.origin[axis]) * r.inv_direction[axis]);
-            t_min = t0.max(t_min);
-            t_max = t1.min(t_max);
-            if t_max <= t_min {
-                return false;
-            }
-        }
-        true
+    /// Slab-tests `r` against this box with SSE, four lanes at a time.
+    ///
+    /// Returns `true` when the span of `t` for which the ray lies inside all
+    /// three axis slabs overlaps `(t_min, t_max)` -- the same criterion
+    /// `hit_naive` applies. Targets other than x86_64 defer to `hit_naive`.
+    pub fn hit_simd(&self, r: Ray, t_min: f32, t_max: f32) -> bool {
+        #[cfg(target_arch = "x86_64")]
+        // SAFETY: SSE is part of the x86_64 baseline, so the `_mm_*` intrinsics
+        // used here are available on every CPU this block is compiled for.
+        let hit = unsafe {
+            use std::arch::x86_64::*;
+            let inv = r.inv_direction;
+            let o4 = _mm_set_ps(0., r.origin.z, r.origin.y, r.origin.x);
+            // Scale by the reciprocal direction, as `hit_naive` does; the raw
+            // direction would zero out every axis the ray does not travel along.
+            let inv4 = _mm_set_ps(0., inv[2], inv[1], inv[0]);
+            let bmin4 = _mm_set_ps(0., self.min().z, self.min().y, self.min().x);
+            let bmax4 = _mm_set_ps(0., self.max().z, self.max().y, self.max().x);
+            let t1 = _mm_mul_ps(_mm_sub_ps(bmin4, o4), inv4);
+            let t2 = _mm_mul_ps(_mm_sub_ps(bmax4, o4), inv4);
+            // Arrays have a guaranteed layout; tuples do not, so read lanes
+            // through `[f32; 4]`. Lane 3 is padding and is ignored.
+            let far: [f32; 4] = std::mem::transmute(_mm_max_ps(t1, t2));
+            let near: [f32; 4] = std::mem::transmute(_mm_min_ps(t1, t2));
+            let t_enter = near[0].max(near[1]).max(near[2]).max(t_min);
+            let t_exit = far[0].min(far[1]).min(far[2]).min(t_max);
+            t_enter < t_exit
+        };
+        #[cfg(not(target_arch = "x86_64"))]
+        let hit = self.hit_naive(r, t_min, t_max);
+        hit
     }
 
     pub fn hit_fast(&self, r: Ray, _t_min: f32, _t_max: f32) -> bool {
@@ -169,3 +202,48 @@ pub fn surrounding_box(box0: &AABB, box1: &AABB) -> AABB {
     );
     AABB::new(min, max)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // `hit_test!` emits, for each listed method, one `hit::<name>` and one `miss::<name>` test.
+    macro_rules! hit_test {
+        ($($name:ident,)*) => {
+            mod hit {
+                use super::*;
+                $(
+                    #[test]
+                    fn $name() {
+                        let t_min = 0.001;
+                        let t_max = f32::MAX;
+                        let bb = AABB::new([1., -1., -1.], [3., 1., 1.]);
+                        // Hit: a ray from the origin along +x reaches the box at x = 1.
+                        let r = Ray::new([0., 0., 0.], [1., 0., 0.], 0.5);
+                        assert!(bb.$name(r, t_min, t_max));
+                    }
+                )*
+            }
+            mod miss {
+                use super::*;
+                $(
+                    #[test]
+                    fn $name() {
+                        let t_min = 0.001;
+                        let t_max = f32::MAX;
+                        let bb = AABB::new([1., -1., -1.], [3., 1., 1.]);
+                        // Miss: a ray from the origin along -x travels away from the box.
+                        let r = Ray::new([0., 0., 0.], [-1., 0., 0.], 0.5);
+                        assert!(!bb.$name(r, t_min, t_max));
+                    }
+                )*
+            }
+        }
+    }
+
+    hit_test! {
+        hit_naive,
+        hit2,
+        hit_fast,
+        hit_simd,
+    }
+}
diff --git a/rtiow/renderer/src/bvh_triangles.rs b/rtiow/renderer/src/bvh_triangles.rs
index 67a598b..f7e7246 100644
--- a/rtiow/renderer/src/bvh_triangles.rs
+++ b/rtiow/renderer/src/bvh_triangles.rs
@@ -77,7 +77,7 @@ where
     fn build_bvh(&mut self) {
         // assign all triangles to root node
         let root = BVHNode {
-            aabb: AABB::new(0f32.into(), 0f32.into()),
+            aabb: AABB::default(),
             left_child: 0,
             first_prim: 0,
             prim_count: self.triangles.len() - 1,
@@ -139,13 +139,13 @@ where
         let left_child_idx = self.bvh_nodes.len();
         let right_child_idx = left_child_idx + 1;
         let left = BVHNode {
-            aabb: AABB::new(0f32.into(), 0f32.into()),
+            aabb: AABB::default(),
             left_child: 0,
             first_prim: first_prim,
             prim_count: left_count,
         };
         let right = BVHNode {
-            aabb: AABB::new(0f32.into(), 0f32.into()),
+            aabb: AABB::default(),
             left_child: 0,
             first_prim: i as usize,
             prim_count: prim_count - left_count,