From 52520f7c7321524f6d9f62ce5a423551f9730e27 Mon Sep 17 00:00:00 2001 From: Thomas Churchman Date: Sun, 22 Feb 2026 14:45:42 +0100 Subject: [PATCH 1/2] Add fast approximate reciprocal methods for float vectors x86 and AArch64 have instructions to calculate fast approximate reciprocals, and these can speed up some algorithms quite nicely (e.g. sprinkling this in Vello's `flatten_simd.rs` results in -4% flattening timings for GhostScript Tiger — though actually landing that there requires a bit of thought about whether the lowered precision is acceptable, of course!). There is some detail here that this PR as-is doesn't attempt to solve. x86's `rcp` has about 12 bits of precision, AArch64's `vrecpe` about 8 bits. AArch64 has an additional instruction however, `vrecps`, to perform a Newton refinement step, which bumps the precision to 16 bits. That'd look something like the following. ```rust let x0 = vrecpeq_f32(a); x0 * vrecpsq_f32(a, x0); // calculates x0 * (2 - x0 * a), roughly doubling the precision of the `x0` estimate ``` Then, AVX512 introduces `rcp14`, which allows calculating to 14-bit precision with (I believe) the same performance as `rcp`, and extends support to `f64`. In any case, this method does the simplest thing of just exposing the cheapest hardware estimate, similar to e.g. Highway's `ApproximateReciprocal`. 
--- fearless_simd/src/generated/avx2.rs | 32 ++++++++++++++++++ fearless_simd/src/generated/fallback.rs | 40 +++++++++++++++++++++++ fearless_simd/src/generated/neon.rs | 40 +++++++++++++++++++++++ fearless_simd/src/generated/simd_trait.rs | 14 ++++++++ fearless_simd/src/generated/simd_types.rs | 24 ++++++++++++++ fearless_simd/src/generated/sse4_2.rs | 40 +++++++++++++++++++++++ fearless_simd/src/generated/wasm.rs | 40 +++++++++++++++++++++++ fearless_simd_gen/src/arch/neon.rs | 5 +++ fearless_simd_gen/src/arch/x86.rs | 5 +++ fearless_simd_gen/src/mk_fallback.rs | 9 +++++ fearless_simd_gen/src/mk_wasm.rs | 11 +++++++ fearless_simd_gen/src/mk_x86.rs | 8 +++++ fearless_simd_gen/src/ops.rs | 10 ++++++ fearless_simd_tests/tests/harness/mod.rs | 30 +++++++++++++++++ 14 files changed, 308 insertions(+) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 425d46a11..819d90a4b 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -192,6 +192,10 @@ impl Simd for Avx2 { unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } } @@ -2182,6 +2186,10 @@ impl Simd for Avx2 { unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.splat_f64x2(1.0) / a + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } } @@ -2558,6 +2566,10 @@ impl Simd for Avx2 { unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { 
unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) } } @@ -4990,6 +5002,10 @@ impl Simd for Avx2 { unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + self.splat_f64x4(1.0) / a + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) } } @@ -5423,6 +5439,14 @@ impl Simd for Avx2 { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8026,6 +8050,14 @@ impl Simd for Avx2 { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index dcf030769..afa3830d8 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -251,6 +251,10 @@ impl Simd for Fallback { .simd_into(self) } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + self.splat_f32x4(1.0) / a + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::add(a[0usize], &b[0usize]), @@ -3816,6 +3820,10 @@ impl Simd for Fallback { [f64::sqrt(a[0usize]), f64::sqrt(a[1usize])].simd_into(self) } #[inline(always)] + fn approximate_recip_f64x2(self, a: 
f64x2) -> f64x2 { + self.splat_f64x2(1.0) / a + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::add(a[0usize], &b[0usize]), @@ -4226,6 +4234,14 @@ impl Simd for Fallback { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -6482,6 +6498,14 @@ impl Simd for Fallback { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -6918,6 +6942,14 @@ impl Simd for Fallback { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -9296,6 +9328,14 @@ impl Simd for Fallback { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); 
let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index d571a637a..8ce8d3c7a 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -184,6 +184,10 @@ impl Simd for Neon { unsafe { vsqrtq_f32(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { vrecpeq_f32(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vaddq_f32(a.into(), b.into()).simd_into(self) } } @@ -2062,6 +2066,10 @@ impl Simd for Neon { unsafe { vsqrtq_f64(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + unsafe { vrecpeq_f64(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vaddq_f64(a.into(), b.into()).simd_into(self) } } @@ -2455,6 +2463,14 @@ impl Simd for Neon { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -5128,6 +5144,14 @@ impl Simd for Neon { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -5668,6 +5692,14 @@ impl Simd for Neon { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn 
approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8475,6 +8507,14 @@ impl Simd for Neon { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index f16d4a6a6..1f4742f64 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -152,6 +152,8 @@ pub trait Simd: fn neg_f32x4(self, a: f32x4) -> f32x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x4(self, a: f32x4) -> f32x4; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4; #[doc = "Add two vectors element-wise."] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4; #[doc = "Subtract two vectors element-wise."] @@ -901,6 +903,8 @@ pub trait Simd: fn neg_f64x2(self, a: f64x2) -> f64x2; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x2(self, a: f64x2) -> f64x2; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2; #[doc = "Add two vectors element-wise."] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2; #[doc = "Subtract two vectors element-wise."] @@ -1046,6 +1050,8 @@ pub trait Simd: fn neg_f32x8(self, a: f32x8) -> f32x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x8(self, a: f32x8) -> f32x8; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8; #[doc = "Add two vectors element-wise."] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8; #[doc = "Subtract two vectors element-wise."] @@ -1817,6 +1823,8 @@ pub trait Simd: fn neg_f64x4(self, a: f64x4) -> f64x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x4(self, a: f64x4) -> f64x4; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4; #[doc = "Add two vectors element-wise."] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4; #[doc = "Subtract two vectors element-wise."] @@ -1966,6 +1974,8 @@ pub trait Simd: fn neg_f32x16(self, a: f32x16) -> f32x16; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x16(self, a: f32x16) -> f32x16; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16; #[doc = "Add two vectors element-wise."] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16; #[doc = "Subtract two vectors element-wise."] @@ -2731,6 +2741,8 @@ pub trait Simd: fn neg_f64x8(self, a: f64x8) -> f64x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x8(self, a: f64x8) -> f64x8; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8; #[doc = "Add two vectors element-wise."] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8; #[doc = "Subtract two vectors element-wise."] @@ -2996,6 +3008,8 @@ pub trait SimdFloat: fn abs(self) -> Self; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt(self) -> Self; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip(self) -> Self; #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."] fn copysign(self, rhs: impl SimdInto) -> Self; #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each element is all ones if the corresponding elements are equal, and all zeroes if not."] diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index 10b058deb..044ecbc2c 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -137,6 +137,10 @@ impl crate::SimdFloat for f32x4 { self.simd.sqrt_f32x4(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f32x4(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f32x4(self, rhs.simd_into(self.simd)) } @@ -1928,6 +1932,10 @@ impl crate::SimdFloat for f64x2 { self.simd.sqrt_f64x2(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f64x2(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f64x2(self, rhs.simd_into(self.simd)) } @@ -2311,6 +2319,10 @@ impl crate::SimdFloat for f32x8 { self.simd.sqrt_f32x8(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f32x8(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f32x8(self, rhs.simd_into(self.simd)) } @@ -4182,6 +4194,10 @@ impl crate::SimdFloat for f64x4 { self.simd.sqrt_f64x4(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f64x4(self) + } + #[inline(always)] fn copysign(self, rhs: impl 
SimdInto) -> Self { self.simd.copysign_f64x4(self, rhs.simd_into(self.simd)) } @@ -4585,6 +4601,10 @@ impl crate::SimdFloat for f32x16 { self.simd.sqrt_f32x16(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f32x16(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f32x16(self, rhs.simd_into(self.simd)) } @@ -6406,6 +6426,10 @@ impl crate::SimdFloat for f64x8 { self.simd.sqrt_f64x8(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f64x8(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f64x8(self, rhs.simd_into(self.simd)) } diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index cc4efd0df..64844ac3a 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -197,6 +197,10 @@ impl Simd for Sse4_2 { unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } } @@ -2222,6 +2226,10 @@ impl Simd for Sse4_2 { unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.splat_f64x2(1.0) / a + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } } @@ -2600,6 +2608,14 @@ impl Simd for Sse4_2 { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> 
f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -5055,6 +5071,14 @@ impl Simd for Sse4_2 { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -5533,6 +5557,14 @@ impl Simd for Sse4_2 { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8128,6 +8160,14 @@ impl Simd for Sse4_2 { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 7730650f6..996960aac 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -183,6 +183,10 @@ impl Simd for WasmSimd128 { f32x4_sqrt(a.into()).simd_into(self) } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + self.div_f32x4(self.splat_f32x4(1.0), a) + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { f32x4_add(a.into(), 
b.into()).simd_into(self) } @@ -2147,6 +2151,10 @@ impl Simd for WasmSimd128 { f64x2_sqrt(a.into()).simd_into(self) } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.div_f64x2(self.splat_f64x2(1.0), a) + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_add(a.into(), b.into()).simd_into(self) } @@ -2554,6 +2562,14 @@ impl Simd for WasmSimd128 { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -5007,6 +5023,14 @@ impl Simd for WasmSimd128 { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -5485,6 +5509,14 @@ impl Simd for WasmSimd128 { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8072,6 +8104,14 @@ impl Simd for WasmSimd128 { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = 
self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd_gen/src/arch/neon.rs b/fearless_simd_gen/src/arch/neon.rs index 22b29c39f..f32a4f3e2 100644 --- a/fearless_simd_gen/src/arch/neon.rs +++ b/fearless_simd_gen/src/arch/neon.rs @@ -69,6 +69,11 @@ pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream #sub(a.into(), c2) } } + "approximate_recip" => { + let vrecpe = simple_intrinsic("vrecpe", ty); + let a = &args[0]; + quote! { #vrecpe(#a) } + } _ => unimplemented!("missing {op}"), } } diff --git a/fearless_simd_gen/src/arch/x86.rs b/fearless_simd_gen/src/arch/x86.rs index 210fd5c1a..43dd8b5a5 100644 --- a/fearless_simd_gen/src/arch/x86.rs +++ b/fearless_simd_gen/src/arch/x86.rs @@ -90,6 +90,11 @@ pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream #or(#and(mask, #b), #andnot(mask, #a)) } } + "approximate_recip" => { + // f32 only; f64 handled in mk_x86.rs + let intrinsic = simple_intrinsic("rcp", ty); + quote! { #intrinsic ( #( #args ),* ) } + } "mul" => { let suffix = op_suffix(ty.scalar, ty.scalar_bits, false); let intrinsic = if matches!(ty.scalar, ScalarType::Int | ScalarType::Unsigned) { diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index 1a5ec5382..f15a9688c 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -149,6 +149,15 @@ impl Level for Fallback { } } OpSig::Unary => { + if method == "approximate_recip" { + let splat_op = generic_op_name("splat", vec_ty); + return quote! 
{ + #method_sig { + self.#splat_op(1.0) / a + } + }; + } + let items = make_list( (0..vec_ty.len) .map(|idx| { diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 44a04199f..8f6a60c44 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -95,6 +95,17 @@ impl Level for WasmSimd128 { quote! { self.#sub(a, self.#trunc(a)) } + } else if matches!(method, "approximate_recip") { + assert_eq!( + vec_ty.scalar, + ScalarType::Float, + "only float supports approximate_recip" + ); + let splat_op = generic_op_name("splat", vec_ty); + let div_op = generic_op_name("div", vec_ty); + quote! { + self.#div_op(self.#splat_op(1.0), a) + } } else { let expr = wasm::expr(method, vec_ty, &args); quote! { #expr.simd_into(self) } diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 883bc1ffd..f4dffd368 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -309,6 +309,14 @@ impl X86 { } } } + "approximate_recip" if vec_ty.scalar_bits == 64 => { + let splat_op = generic_op_name("splat", vec_ty); + quote! { + #method_sig { + self.#splat_op(1.0) / a + } + } + } "not" => { quote! { #method_sig { diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index 3c6609379..4b109d1b2 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -529,6 +529,16 @@ const FLOAT_OPS: &[Op] = &[ "Compute the square root of each element.\n\n\ Negative elements other than `-0.0` will become NaN.", ), + Op::new( + "approximate_recip", + OpKind::VecTraitMethod, + OpSig::Unary, + "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\ + This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\ + On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \ + On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
\ + The precision of this operation may change as new platform support is added.", + ), Op::new( "add", OpKind::Overloaded(CoreOpTrait::Add), diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 687efef96..b1f1d76af 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -54,6 +54,21 @@ fn sqrt_f32x4(simd: S) { assert_eq!(*f32x4::sqrt(a), [2.0, 0.0, 1.0, f32::sqrt(2.0)]); } +#[simd_test] +fn approximate_recip_f32x4(simd: S) { + let a = f32x4::from_slice(simd, &[1.0, -2.0, 23.0, 9.0]); + let result = a.approximate_recip(); + let expected = [1.0, -0.5, 1. / 23., 1. / 9.]; + for i in 0..4 { + let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + #[simd_test] fn div_f32x4(simd: S) { let a = f32x4::from_slice(simd, &[4.0, 2.0, 1.0, 0.0]); @@ -2649,6 +2664,21 @@ fn sqrt_f64x2(simd: S) { assert_eq!(*a.sqrt(), [2.0, 3.0]); } +#[simd_test] +fn approximate_recip_f64x2(simd: S) { + let a = f64x2::from_slice(simd, &[1.0, 23.0]); + let result = a.approximate_recip(); + let expected = [1.0, 1. 
/ 23.]; + for i in 0..2 { + let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + #[simd_test] fn copysign_f64x2(simd: S) { let a = f64x2::from_slice(simd, &[1.5, -2.5]); From 6bdfe8d56acba1f5e4f0b092b781738ae4f3e97a Mon Sep 17 00:00:00 2001 From: Thomas Churchman Date: Sun, 22 Feb 2026 15:31:44 +0100 Subject: [PATCH 2/2] Make tests consistent --- fearless_simd_tests/tests/harness/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index b1f1d76af..d1e904cd8 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -2666,9 +2666,9 @@ fn sqrt_f64x2(simd: S) { #[simd_test] fn approximate_recip_f64x2(simd: S) { - let a = f64x2::from_slice(simd, &[1.0, 23.0]); + let a = f64x2::from_slice(simd, &[-2.0, 23.0]); let result = a.approximate_recip(); - let expected = [1.0, 1. / 23.]; + let expected = [-0.5, 1. / 23.]; for i in 0..2 { let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); assert!(