diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 425d46a11..819d90a4b 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -192,6 +192,10 @@ impl Simd for Avx2 { unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } } @@ -2182,6 +2186,10 @@ impl Simd for Avx2 { unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.splat_f64x2(1.0) / a + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } } @@ -2558,6 +2566,10 @@ impl Simd for Avx2 { unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) } } @@ -4990,6 +5002,10 @@ impl Simd for Avx2 { unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + self.splat_f64x4(1.0) / a + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) } } @@ -5423,6 +5439,14 @@ impl Simd for Avx2 { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = 
self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8026,6 +8050,14 @@ impl Simd for Avx2 { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index dcf030769..afa3830d8 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -251,6 +251,10 @@ impl Simd for Fallback { .simd_into(self) } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + self.splat_f32x4(1.0) / a + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::add(a[0usize], &b[0usize]), @@ -3816,6 +3820,10 @@ impl Simd for Fallback { [f64::sqrt(a[0usize]), f64::sqrt(a[1usize])].simd_into(self) } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.splat_f64x2(1.0) / a + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::add(a[0usize], &b[0usize]), @@ -4226,6 +4234,14 @@ impl Simd for Fallback { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -6482,6 +6498,14 @@ impl Simd for Fallback { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let 
(a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -6918,6 +6942,14 @@ impl Simd for Fallback { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -9296,6 +9328,14 @@ impl Simd for Fallback { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index d571a637a..8ce8d3c7a 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -184,6 +184,10 @@ impl Simd for Neon { unsafe { vsqrtq_f32(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { vrecpeq_f32(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vaddq_f32(a.into(), b.into()).simd_into(self) } } @@ -2062,6 +2066,10 @@ impl Simd for Neon { unsafe { vsqrtq_f64(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + unsafe { vrecpeq_f64(a.into()).simd_into(self) } + } + 
#[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vaddq_f64(a.into(), b.into()).simd_into(self) } } @@ -2455,6 +2463,14 @@ impl Simd for Neon { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -5128,6 +5144,14 @@ impl Simd for Neon { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -5668,6 +5692,14 @@ impl Simd for Neon { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8475,6 +8507,14 @@ impl Simd for Neon { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git 
a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index f16d4a6a6..1f4742f64 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -152,6 +152,8 @@ pub trait Simd: fn neg_f32x4(self, a: f32x4) -> f32x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x4(self, a: f32x4) -> f32x4; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4; #[doc = "Add two vectors element-wise."] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4; #[doc = "Subtract two vectors element-wise."] @@ -901,6 +903,8 @@ pub trait Simd: fn neg_f64x2(self, a: f64x2) -> f64x2; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x2(self, a: f64x2) -> f64x2; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2; #[doc = "Add two vectors element-wise."] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2; #[doc = "Subtract two vectors element-wise."] @@ -1046,6 +1050,8 @@ pub trait Simd: fn neg_f32x8(self, a: f32x8) -> f32x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x8(self, a: f32x8) -> f32x8; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8; #[doc = "Add two vectors element-wise."] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8; #[doc = "Subtract two vectors element-wise."] @@ -1817,6 +1823,8 @@ pub trait Simd: fn neg_f64x4(self, a: f64x4) -> f64x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x4(self, a: f64x4) -> f64x4; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4; #[doc = "Add two vectors element-wise."] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4; #[doc = "Subtract two vectors element-wise."] @@ -1966,6 +1974,8 @@ pub trait Simd: fn neg_f32x16(self, a: f32x16) -> f32x16; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x16(self, a: f32x16) -> f32x16; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16; #[doc = "Add two vectors element-wise."] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16; #[doc = "Subtract two vectors element-wise."] @@ -2731,6 +2741,8 @@ pub trait Simd: fn neg_f64x8(self, a: f64x8) -> f64x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x8(self, a: f64x8) -> f64x8; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8; #[doc = "Add two vectors element-wise."] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8; #[doc = "Subtract two vectors element-wise."] @@ -2996,6 +3008,8 @@ pub trait SimdFloat: fn abs(self) -> Self; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt(self) -> Self; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip(self) -> Self; #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."] fn copysign(self, rhs: impl SimdInto) -> Self; #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each element is all ones if the corresponding elements are equal, and all zeroes if not."] diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index 10b058deb..044ecbc2c 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -137,6 +137,10 @@ impl crate::SimdFloat for f32x4 { self.simd.sqrt_f32x4(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f32x4(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f32x4(self, rhs.simd_into(self.simd)) } @@ -1928,6 +1932,10 @@ impl crate::SimdFloat for f64x2 { self.simd.sqrt_f64x2(self) } 
#[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f64x2(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f64x2(self, rhs.simd_into(self.simd)) } @@ -2311,6 +2319,10 @@ impl crate::SimdFloat for f32x8 { self.simd.sqrt_f32x8(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f32x8(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f32x8(self, rhs.simd_into(self.simd)) } @@ -4182,6 +4194,10 @@ impl crate::SimdFloat for f64x4 { self.simd.sqrt_f64x4(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f64x4(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f64x4(self, rhs.simd_into(self.simd)) } @@ -4585,6 +4601,10 @@ impl crate::SimdFloat for f32x16 { self.simd.sqrt_f32x16(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f32x16(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f32x16(self, rhs.simd_into(self.simd)) } @@ -6406,6 +6426,10 @@ impl crate::SimdFloat for f64x8 { self.simd.sqrt_f64x8(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f64x8(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f64x8(self, rhs.simd_into(self.simd)) } diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index cc4efd0df..64844ac3a 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -197,6 +197,10 @@ impl Simd for Sse4_2 { unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> 
f32x4 { unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } } @@ -2222,6 +2226,10 @@ impl Simd for Sse4_2 { unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.splat_f64x2(1.0) / a + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } } @@ -2600,6 +2608,14 @@ impl Simd for Sse4_2 { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -5055,6 +5071,14 @@ impl Simd for Sse4_2 { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -5533,6 +5557,14 @@ impl Simd for Sse4_2 { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8128,6 +8160,14 @@ impl Simd for Sse4_2 { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, 
a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 7730650f6..996960aac 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -183,6 +183,10 @@ impl Simd for WasmSimd128 { f32x4_sqrt(a.into()).simd_into(self) } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + self.div_f32x4(self.splat_f32x4(1.0), a) + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { f32x4_add(a.into(), b.into()).simd_into(self) } @@ -2147,6 +2151,10 @@ impl Simd for WasmSimd128 { f64x2_sqrt(a.into()).simd_into(self) } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.div_f64x2(self.splat_f64x2(1.0), a) + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_add(a.into(), b.into()).simd_into(self) } @@ -2554,6 +2562,14 @@ impl Simd for WasmSimd128 { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -5007,6 +5023,14 @@ impl Simd for WasmSimd128 { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { 
let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -5485,6 +5509,14 @@ impl Simd for WasmSimd128 { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8072,6 +8104,14 @@ impl Simd for WasmSimd128 { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd_gen/src/arch/neon.rs b/fearless_simd_gen/src/arch/neon.rs index 22b29c39f..f32a4f3e2 100644 --- a/fearless_simd_gen/src/arch/neon.rs +++ b/fearless_simd_gen/src/arch/neon.rs @@ -69,6 +69,11 @@ pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream #sub(a.into(), c2) } } + "approximate_recip" => { + let vrecpe = simple_intrinsic("vrecpe", ty); + let a = &args[0]; + quote! { #vrecpe(#a) } + } _ => unimplemented!("missing {op}"), } } diff --git a/fearless_simd_gen/src/arch/x86.rs b/fearless_simd_gen/src/arch/x86.rs index 210fd5c1a..43dd8b5a5 100644 --- a/fearless_simd_gen/src/arch/x86.rs +++ b/fearless_simd_gen/src/arch/x86.rs @@ -90,6 +90,11 @@ pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream #or(#and(mask, #b), #andnot(mask, #a)) } } + "approximate_recip" => { + // f32 only; f64 handled in mk_x86.rs + let intrinsic = simple_intrinsic("rcp", ty); + quote! 
{ #intrinsic ( #( #args ),* ) } + } "mul" => { let suffix = op_suffix(ty.scalar, ty.scalar_bits, false); let intrinsic = if matches!(ty.scalar, ScalarType::Int | ScalarType::Unsigned) { diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index 1a5ec5382..f15a9688c 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -149,6 +149,15 @@ impl Level for Fallback { } } OpSig::Unary => { + if method == "approximate_recip" { + let splat_op = generic_op_name("splat", vec_ty); + return quote! { + #method_sig { + self.#splat_op(1.0) / a + } + }; + } + let items = make_list( (0..vec_ty.len) .map(|idx| { diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 44a04199f..8f6a60c44 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -95,6 +95,17 @@ impl Level for WasmSimd128 { quote! { self.#sub(a, self.#trunc(a)) } + } else if matches!(method, "approximate_recip") { + assert_eq!( + vec_ty.scalar, + ScalarType::Float, + "only float supports approximate_recip" + ); + let splat_op = generic_op_name("splat", vec_ty); + let div_op = generic_op_name("div", vec_ty); + quote! { + self.#div_op(self.#splat_op(1.0), a) + } } else { let expr = wasm::expr(method, vec_ty, &args); quote! { #expr.simd_into(self) } diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 883bc1ffd..f4dffd368 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -309,6 +309,14 @@ impl X86 { } } } + "approximate_recip" if vec_ty.scalar_bits == 64 => { + let splat_op = generic_op_name("splat", vec_ty); + quote! { + #method_sig { + self.#splat_op(1.0) / a + } + } + } "not" => { quote! 
{ #method_sig { diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index 3c6609379..4b109d1b2 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -529,6 +529,16 @@ const FLOAT_OPS: &[Op] = &[ "Compute the square root of each element.\n\n\ Negative elements other than `-0.0` will become NaN.", ), + Op::new( + "approximate_recip", + OpKind::VecTraitMethod, + OpSig::Unary, + "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\ + This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\ + On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \ + On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \ + The precision of this operation may change as new platform support is added.", + ), Op::new( "add", OpKind::Overloaded(CoreOpTrait::Add), diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 687efef96..d1e904cd8 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -54,6 +54,21 @@ fn sqrt_f32x4(simd: S) { assert_eq!(*f32x4::sqrt(a), [2.0, 0.0, 1.0, f32::sqrt(2.0)]); } +#[simd_test] +fn approximate_recip_f32x4(simd: S) { + let a = f32x4::from_slice(simd, &[1.0, -2.0, 23.0, 9.0]); + let result = a.approximate_recip(); + let expected = [1.0, -0.5, 1. / 23., 1. / 9.]; + for i in 0..4 { + let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + #[simd_test] fn div_f32x4(simd: S) { let a = f32x4::from_slice(simd, &[4.0, 2.0, 1.0, 0.0]); @@ -2649,6 +2664,21 @@ fn sqrt_f64x2(simd: S) { assert_eq!(*a.sqrt(), [2.0, 3.0]); } +#[simd_test] +fn approximate_recip_f64x2(simd: S) { + let a = f64x2::from_slice(simd, &[1.0, -2.0]); + let result = a.approximate_recip(); + let expected = [1.0, -0.5];
 + for i in 0..2 { + let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + #[simd_test] fn copysign_f64x2(simd: S) { let a = f64x2::from_slice(simd, &[1.5, -2.5]);