linebender · tomcur · Feb 22, 2026 · Feb 22, 2026 · LaurenzV · Feb 23, 2026
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
@@ -192,6 +192,10 @@ impl Simd for Avx2 {
         unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
     }
     #[inline(always)]
+    fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_rcp_ps(a.into()).simd_into(self) } // raw `_mm_rcp_ps` estimate; no refinement step
+    }
+    #[inline(always)]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
         unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
     }
@@ -2182,6 +2186,10 @@ impl Simd for Avx2 {
         unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
     }
     #[inline(always)]
+    fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        self.splat_f64x2(1.0) / a // plain `1.0 / a`: no estimate intrinsic is used for f64 here
+    }
+    #[inline(always)]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
         unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
     }
@@ -2558,6 +2566,10 @@ impl Simd for Avx2 {
         unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) }
     }
     #[inline(always)]
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        unsafe { _mm256_rcp_ps(a.into()).simd_into(self) } // raw `_mm256_rcp_ps` estimate; no refinement step
+    }
+    #[inline(always)]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
         unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) }
     }
@@ -4990,6 +5002,10 @@ impl Simd for Avx2 {
         unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) }
     }
     #[inline(always)]
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        self.splat_f64x4(1.0) / a // plain `1.0 / a`: no estimate intrinsic is used for f64 here
+    }
+    #[inline(always)]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) }
     }
@@ -5423,6 +5439,14 @@ impl Simd for Avx2 {
         self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        // Run the f32x8 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f32x16(a);
+        let r0 = self.approximate_recip_f32x8(x0);
+        let r1 = self.approximate_recip_f32x8(x1);
+        self.combine_f32x8(r0, r1)
+    }
+    #[inline(always)]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         let (a0, a1) = self.split_f32x16(a);
         let (b0, b1) = self.split_f32x16(b);
@@ -8026,6 +8050,14 @@ impl Simd for Avx2 {
         self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        // Run the f64x4 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f64x8(a);
+        let r0 = self.approximate_recip_f64x4(x0);
+        let r1 = self.approximate_recip_f64x4(x1);
+        self.combine_f64x4(r0, r1)
+    }
+    #[inline(always)]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         let (a0, a1) = self.split_f64x8(a);
         let (b0, b1) = self.split_f64x8(b);

diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs
@@ -251,6 +251,10 @@ impl Simd for Fallback {
         .simd_into(self)
     }
     #[inline(always)]
+    fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        self.splat_f32x4(1.0) / a // plain `1.0 / a`; the fallback uses no hardware estimate
+    }
+    #[inline(always)]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
         [
             f32::add(a[0usize], &b[0usize]),
@@ -3816,6 +3820,10 @@ impl Simd for Fallback {
         [f64::sqrt(a[0usize]), f64::sqrt(a[1usize])].simd_into(self)
     }
     #[inline(always)]
+    fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        self.splat_f64x2(1.0) / a // plain `1.0 / a`; the fallback uses no hardware estimate
+    }
+    #[inline(always)]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
         [
             f64::add(a[0usize], &b[0usize]),
@@ -4226,6 +4234,14 @@ impl Simd for Fallback {
         self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        // Run the f32x4 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f32x8(a);
+        let r0 = self.approximate_recip_f32x4(x0);
+        let r1 = self.approximate_recip_f32x4(x1);
+        self.combine_f32x4(r0, r1)
+    }
+    #[inline(always)]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
         let (a0, a1) = self.split_f32x8(a);
         let (b0, b1) = self.split_f32x8(b);
@@ -6482,6 +6498,14 @@ impl Simd for Fallback {
         self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        // Run the f64x2 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f64x4(a);
+        let r0 = self.approximate_recip_f64x2(x0);
+        let r1 = self.approximate_recip_f64x2(x1);
+        self.combine_f64x2(r0, r1)
+    }
+    #[inline(always)]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         let (a0, a1) = self.split_f64x4(a);
         let (b0, b1) = self.split_f64x4(b);
@@ -6918,6 +6942,14 @@ impl Simd for Fallback {
         self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        // Run the f32x8 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f32x16(a);
+        let r0 = self.approximate_recip_f32x8(x0);
+        let r1 = self.approximate_recip_f32x8(x1);
+        self.combine_f32x8(r0, r1)
+    }
+    #[inline(always)]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         let (a0, a1) = self.split_f32x16(a);
         let (b0, b1) = self.split_f32x16(b);
@@ -9296,6 +9328,14 @@ impl Simd for Fallback {
         self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        // Run the f64x4 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f64x8(a);
+        let r0 = self.approximate_recip_f64x4(x0);
+        let r1 = self.approximate_recip_f64x4(x1);
+        self.combine_f64x4(r0, r1)
+    }
+    #[inline(always)]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         let (a0, a1) = self.split_f64x8(a);
         let (b0, b1) = self.split_f64x8(b);

diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
@@ -184,6 +184,10 @@ impl Simd for Neon {
         unsafe { vsqrtq_f32(a.into()).simd_into(self) }
     }
     #[inline(always)]
+    fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { vrecpeq_f32(a.into()).simd_into(self) } // raw `vrecpeq_f32` estimate; no refinement step
+    }
+    #[inline(always)]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
         unsafe { vaddq_f32(a.into(), b.into()).simd_into(self) }
     }
@@ -2062,6 +2066,10 @@ impl Simd for Neon {
         unsafe { vsqrtq_f64(a.into()).simd_into(self) }
     }
     #[inline(always)]
+    fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        unsafe { vrecpeq_f64(a.into()).simd_into(self) } // raw `vrecpeq_f64` estimate; no refinement step
+    }
+    #[inline(always)]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
         unsafe { vaddq_f64(a.into(), b.into()).simd_into(self) }
     }
@@ -2455,6 +2463,14 @@ impl Simd for Neon {
         self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        // Run the f32x4 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f32x8(a);
+        let r0 = self.approximate_recip_f32x4(x0);
+        let r1 = self.approximate_recip_f32x4(x1);
+        self.combine_f32x4(r0, r1)
+    }
+    #[inline(always)]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
         let (a0, a1) = self.split_f32x8(a);
         let (b0, b1) = self.split_f32x8(b);
@@ -5128,6 +5144,14 @@ impl Simd for Neon {
         self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        // Run the f64x2 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f64x4(a);
+        let r0 = self.approximate_recip_f64x2(x0);
+        let r1 = self.approximate_recip_f64x2(x1);
+        self.combine_f64x2(r0, r1)
+    }
+    #[inline(always)]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         let (a0, a1) = self.split_f64x4(a);
         let (b0, b1) = self.split_f64x4(b);
@@ -5668,6 +5692,14 @@ impl Simd for Neon {
         self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        // Run the f32x8 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f32x16(a);
+        let r0 = self.approximate_recip_f32x8(x0);
+        let r1 = self.approximate_recip_f32x8(x1);
+        self.combine_f32x8(r0, r1)
+    }
+    #[inline(always)]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         let (a0, a1) = self.split_f32x16(a);
         let (b0, b1) = self.split_f32x16(b);
@@ -8475,6 +8507,14 @@ impl Simd for Neon {
         self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
     }
     #[inline(always)]
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        // Run the f64x4 reciprocal on each half in order, then rejoin the two results.
+        let (x0, x1) = self.split_f64x8(a);
+        let r0 = self.approximate_recip_f64x4(x0);
+        let r1 = self.approximate_recip_f64x4(x1);
+        self.combine_f64x4(r0, r1)
+    }
+    #[inline(always)]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         let (a0, a1) = self.split_f64x8(a);
         let (b0, b1) = self.split_f64x8(b);

diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
@@ -152,6 +152,8 @@ pub trait Simd:
     fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self>; // AVX2: `_mm_rcp_ps`; NEON: `vrecpeq_f32`; fallback: `1.0 / a` (other backends may differ)
     #[doc = "Add two vectors element-wise."]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Subtract two vectors element-wise."]
@@ -901,6 +903,8 @@ pub trait Simd:
     fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self>; // NEON: `vrecpeq_f64`; AVX2 and fallback: `1.0 / a` (other backends may differ)
     #[doc = "Add two vectors element-wise."]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Subtract two vectors element-wise."]
@@ -1046,6 +1050,8 @@ pub trait Simd:
     fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self>; // AVX2: `_mm256_rcp_ps`; NEON and fallback: two f32x4 halves (other backends may differ)
     #[doc = "Add two vectors element-wise."]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Subtract two vectors element-wise."]
@@ -1817,6 +1823,8 @@ pub trait Simd:
     fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self>; // AVX2: `1.0 / a`; NEON and fallback: two f64x2 halves (other backends may differ)
     #[doc = "Add two vectors element-wise."]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Subtract two vectors element-wise."]
@@ -1966,6 +1974,8 @@ pub trait Simd:
     fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self>; // AVX2, NEON and fallback: two f32x8 halves (other backends may differ)
     #[doc = "Add two vectors element-wise."]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Subtract two vectors element-wise."]
@@ -2731,6 +2741,8 @@ pub trait Simd:
     fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self>; // AVX2, NEON and fallback: two f64x4 halves (other backends may differ)
     #[doc = "Add two vectors element-wise."]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Subtract two vectors element-wise."]
@@ -2996,6 +3008,8 @@ pub trait SimdFloat<S: Simd>:
     fn abs(self) -> Self;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt(self) -> Self;
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    fn approximate_recip(self) -> Self; // vector types implement this by calling the matching `approximate_recip_*` method on their `simd` field
     #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self;
     #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each element is all ones if the corresponding elements are equal, and all zeroes if not."]

diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs
@@ -137,6 +137,10 @@ impl<S: Simd> crate::SimdFloat<S> for f32x4<S> {
         self.simd.sqrt_f32x4(self)
     }
     #[inline(always)]
+    fn approximate_recip(self) -> Self {
+        self.simd.approximate_recip_f32x4(self) // forwards to the f32x4 method on `self.simd`
+    }
+    #[inline(always)]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
         self.simd.copysign_f32x4(self, rhs.simd_into(self.simd))
     }
@@ -1928,6 +1932,10 @@ impl<S: Simd> crate::SimdFloat<S> for f64x2<S> {
         self.simd.sqrt_f64x2(self)
     }
     #[inline(always)]
+    fn approximate_recip(self) -> Self {
+        self.simd.approximate_recip_f64x2(self) // forwards to the f64x2 method on `self.simd`
+    }
+    #[inline(always)]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
         self.simd.copysign_f64x2(self, rhs.simd_into(self.simd))
     }
@@ -2311,6 +2319,10 @@ impl<S: Simd> crate::SimdFloat<S> for f32x8<S> {
         self.simd.sqrt_f32x8(self)
     }
     #[inline(always)]
+    fn approximate_recip(self) -> Self {
+        self.simd.approximate_recip_f32x8(self) // forwards to the f32x8 method on `self.simd`
+    }
+    #[inline(always)]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
         self.simd.copysign_f32x8(self, rhs.simd_into(self.simd))
     }
@@ -4182,6 +4194,10 @@ impl<S: Simd> crate::SimdFloat<S> for f64x4<S> {
         self.simd.sqrt_f64x4(self)
     }
     #[inline(always)]
+    fn approximate_recip(self) -> Self {
+        self.simd.approximate_recip_f64x4(self) // forwards to the f64x4 method on `self.simd`
+    }
+    #[inline(always)]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
         self.simd.copysign_f64x4(self, rhs.simd_into(self.simd))
     }
@@ -4585,6 +4601,10 @@ impl<S: Simd> crate::SimdFloat<S> for f32x16<S> {
         self.simd.sqrt_f32x16(self)
     }
     #[inline(always)]
+    fn approximate_recip(self) -> Self {
+        self.simd.approximate_recip_f32x16(self) // forwards to the f32x16 method on `self.simd`
+    }
+    #[inline(always)]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
         self.simd.copysign_f32x16(self, rhs.simd_into(self.simd))
     }
@@ -6406,6 +6426,10 @@ impl<S: Simd> crate::SimdFloat<S> for f64x8<S> {
         self.simd.sqrt_f64x8(self)
     }
     #[inline(always)]
+    fn approximate_recip(self) -> Self {
+        self.simd.approximate_recip_f64x8(self) // forwards to the f64x8 method on `self.simd`
+    }
+    #[inline(always)]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
         self.simd.copysign_f64x8(self, rhs.simd_into(self.simd))
     }