diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 425d46a11..819d90a4b 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -192,6 +192,10 @@ impl Simd for Avx2 { unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } } @@ -2182,6 +2186,10 @@ impl Simd for Avx2 { unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.splat_f64x2(1.0) / a + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } } @@ -2558,6 +2566,10 @@ impl Simd for Avx2 { unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + unsafe { _mm256_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { unsafe { _mm256_add_ps(a.into(), b.into()).simd_into(self) } } @@ -4990,6 +5002,10 @@ impl Simd for Avx2 { unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + self.splat_f64x4(1.0) / a + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) } } @@ -5423,6 +5439,14 @@ impl Simd for Avx2 { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = 
self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8026,6 +8050,14 @@ impl Simd for Avx2 { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index dcf030769..afa3830d8 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -251,6 +251,10 @@ impl Simd for Fallback { .simd_into(self) } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + self.splat_f32x4(1.0) / a + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::add(a[0usize], &b[0usize]), @@ -3816,6 +3820,10 @@ impl Simd for Fallback { [f64::sqrt(a[0usize]), f64::sqrt(a[1usize])].simd_into(self) } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.splat_f64x2(1.0) / a + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::add(a[0usize], &b[0usize]), @@ -4226,6 +4234,14 @@ impl Simd for Fallback { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -6482,6 +6498,14 @@ impl Simd for Fallback { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let 
(a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -6918,6 +6942,14 @@ impl Simd for Fallback { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -9296,6 +9328,14 @@ impl Simd for Fallback { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index d571a637a..8ce8d3c7a 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -184,6 +184,10 @@ impl Simd for Neon { unsafe { vsqrtq_f32(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { vrecpeq_f32(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vaddq_f32(a.into(), b.into()).simd_into(self) } } @@ -2062,6 +2066,10 @@ impl Simd for Neon { unsafe { vsqrtq_f64(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + unsafe { vrecpeq_f64(a.into()).simd_into(self) } + } + 
#[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vaddq_f64(a.into(), b.into()).simd_into(self) } } @@ -2455,6 +2463,14 @@ impl Simd for Neon { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -5128,6 +5144,14 @@ impl Simd for Neon { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -5668,6 +5692,14 @@ impl Simd for Neon { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8475,6 +8507,14 @@ impl Simd for Neon { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git 
a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index f16d4a6a6..1f4742f64 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -152,6 +152,8 @@ pub trait Simd: fn neg_f32x4(self, a: f32x4) -> f32x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x4(self, a: f32x4) -> f32x4; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4; #[doc = "Add two vectors element-wise."] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4; #[doc = "Subtract two vectors element-wise."] @@ -901,6 +903,8 @@ pub trait Simd: fn neg_f64x2(self, a: f64x2) -> f64x2; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x2(self, a: f64x2) -> f64x2; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2; #[doc = "Add two vectors element-wise."] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2; #[doc = "Subtract two vectors element-wise."] @@ -1046,6 +1050,8 @@ pub trait Simd: fn neg_f32x8(self, a: f32x8) -> f32x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x8(self, a: f32x8) -> f32x8; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8; #[doc = "Add two vectors element-wise."] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8; #[doc = "Subtract two vectors element-wise."] @@ -1817,6 +1823,8 @@ pub trait Simd: fn neg_f64x4(self, a: f64x4) -> f64x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x4(self, a: f64x4) -> f64x4; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4; #[doc = "Add two vectors element-wise."] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4; #[doc = "Subtract two vectors element-wise."] @@ -1966,6 +1974,8 @@ pub trait Simd: fn neg_f32x16(self, a: f32x16) -> f32x16; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x16(self, a: f32x16) -> f32x16; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16; #[doc = "Add two vectors element-wise."] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16; #[doc = "Subtract two vectors element-wise."] @@ -2731,6 +2741,8 @@ pub trait Simd: fn neg_f64x8(self, a: f64x8) -> f64x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x8(self, a: f64x8) -> f64x8; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8; #[doc = "Add two vectors element-wise."] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8; #[doc = "Subtract two vectors element-wise."] @@ -2996,6 +3008,8 @@ pub trait SimdFloat: fn abs(self) -> Self; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt(self) -> Self; + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + fn approximate_recip(self) -> Self; #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."] fn copysign(self, rhs: impl SimdInto) -> Self; #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each element is all ones if the corresponding elements are equal, and all zeroes if not."] diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index 10b058deb..044ecbc2c 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -137,6 +137,10 @@ impl crate::SimdFloat for f32x4 { self.simd.sqrt_f32x4(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f32x4(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f32x4(self, rhs.simd_into(self.simd)) } @@ -1928,6 +1932,10 @@ impl crate::SimdFloat for f64x2 { self.simd.sqrt_f64x2(self) } 
#[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f64x2(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f64x2(self, rhs.simd_into(self.simd)) } @@ -2311,6 +2319,10 @@ impl crate::SimdFloat for f32x8 { self.simd.sqrt_f32x8(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f32x8(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f32x8(self, rhs.simd_into(self.simd)) } @@ -4182,6 +4194,10 @@ impl crate::SimdFloat for f64x4 { self.simd.sqrt_f64x4(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f64x4(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f64x4(self, rhs.simd_into(self.simd)) } @@ -4585,6 +4601,10 @@ impl crate::SimdFloat for f32x16 { self.simd.sqrt_f32x16(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f32x16(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f32x16(self, rhs.simd_into(self.simd)) } @@ -6406,6 +6426,10 @@ impl crate::SimdFloat for f64x8 { self.simd.sqrt_f64x8(self) } #[inline(always)] + fn approximate_recip(self) -> Self { + self.simd.approximate_recip_f64x8(self) + } + #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> Self { self.simd.copysign_f64x8(self, rhs.simd_into(self.simd)) } diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index cc4efd0df..64844ac3a 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -197,6 +197,10 @@ impl Simd for Sse4_2 { unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + unsafe { _mm_rcp_ps(a.into()).simd_into(self) } + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> 
f32x4 { unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } } @@ -2222,6 +2226,10 @@ impl Simd for Sse4_2 { unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.splat_f64x2(1.0) / a + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } } @@ -2600,6 +2608,14 @@ impl Simd for Sse4_2 { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -5055,6 +5071,14 @@ impl Simd for Sse4_2 { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -5533,6 +5557,14 @@ impl Simd for Sse4_2 { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8128,6 +8160,14 @@ impl Simd for Sse4_2 { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, 
a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 7730650f6..996960aac 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -183,6 +183,10 @@ impl Simd for WasmSimd128 { f32x4_sqrt(a.into()).simd_into(self) } #[inline(always)] + fn approximate_recip_f32x4(self, a: f32x4) -> f32x4 { + self.div_f32x4(self.splat_f32x4(1.0), a) + } + #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { f32x4_add(a.into(), b.into()).simd_into(self) } @@ -2147,6 +2151,10 @@ impl Simd for WasmSimd128 { f64x2_sqrt(a.into()).simd_into(self) } #[inline(always)] + fn approximate_recip_f64x2(self, a: f64x2) -> f64x2 { + self.div_f64x2(self.splat_f64x2(1.0), a) + } + #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_add(a.into(), b.into()).simd_into(self) } @@ -2554,6 +2562,14 @@ impl Simd for WasmSimd128 { self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] + fn approximate_recip_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4( + self.approximate_recip_f32x4(a0), + self.approximate_recip_f32x4(a1), + ) + } + #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); @@ -5007,6 +5023,14 @@ impl Simd for WasmSimd128 { self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] + fn approximate_recip_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2( + self.approximate_recip_f64x2(a0), + self.approximate_recip_f64x2(a1), + ) + } + #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { 
let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); @@ -5485,6 +5509,14 @@ impl Simd for WasmSimd128 { self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] + fn approximate_recip_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8( + self.approximate_recip_f32x8(a0), + self.approximate_recip_f32x8(a1), + ) + } + #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); @@ -8072,6 +8104,14 @@ impl Simd for WasmSimd128 { self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] + fn approximate_recip_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4( + self.approximate_recip_f64x4(a0), + self.approximate_recip_f64x4(a1), + ) + } + #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); diff --git a/fearless_simd_gen/src/arch/neon.rs b/fearless_simd_gen/src/arch/neon.rs index 22b29c39f..f32a4f3e2 100644 --- a/fearless_simd_gen/src/arch/neon.rs +++ b/fearless_simd_gen/src/arch/neon.rs @@ -69,6 +69,11 @@ pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream #sub(a.into(), c2) } } + "approximate_recip" => { + let vrecpe = simple_intrinsic("vrecpe", ty); + let a = &args[0]; + quote! { #vrecpe(#a) } + } _ => unimplemented!("missing {op}"), } } diff --git a/fearless_simd_gen/src/arch/x86.rs b/fearless_simd_gen/src/arch/x86.rs index 210fd5c1a..43dd8b5a5 100644 --- a/fearless_simd_gen/src/arch/x86.rs +++ b/fearless_simd_gen/src/arch/x86.rs @@ -90,6 +90,11 @@ pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream #or(#and(mask, #b), #andnot(mask, #a)) } } + "approximate_recip" => { + // f32 only; f64 handled in mk_x86.rs + let intrinsic = simple_intrinsic("rcp", ty); + quote! 
{ #intrinsic ( #( #args ),* ) } + } "mul" => { let suffix = op_suffix(ty.scalar, ty.scalar_bits, false); let intrinsic = if matches!(ty.scalar, ScalarType::Int | ScalarType::Unsigned) { diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index 1a5ec5382..f15a9688c 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -149,6 +149,15 @@ impl Level for Fallback { } } OpSig::Unary => { + if method == "approximate_recip" { + let splat_op = generic_op_name("splat", vec_ty); + return quote! { + #method_sig { + self.#splat_op(1.0) / a + } + }; + } + let items = make_list( (0..vec_ty.len) .map(|idx| { diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 44a04199f..8f6a60c44 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -95,6 +95,17 @@ impl Level for WasmSimd128 { quote! { self.#sub(a, self.#trunc(a)) } + } else if matches!(method, "approximate_recip") { + assert_eq!( + vec_ty.scalar, + ScalarType::Float, + "only float supports approximate_recip" + ); + let splat_op = generic_op_name("splat", vec_ty); + let div_op = generic_op_name("div", vec_ty); + quote! { + self.#div_op(self.#splat_op(1.0), a) + } } else { let expr = wasm::expr(method, vec_ty, &args); quote! { #expr.simd_into(self) } diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 883bc1ffd..f4dffd368 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -309,6 +309,14 @@ impl X86 { } } } + "approximate_recip" if vec_ty.scalar_bits == 64 => { + let splat_op = generic_op_name("splat", vec_ty); + quote! { + #method_sig { + self.#splat_op(1.0) / a + } + } + } "not" => { quote! 
{ #method_sig { diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index 3c6609379..4b109d1b2 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -529,6 +529,16 @@ const FLOAT_OPS: &[Op] = &[ "Compute the square root of each element.\n\n\ Negative elements other than `-0.0` will become NaN.", ), + Op::new( + "approximate_recip", + OpKind::VecTraitMethod, + OpSig::Unary, + "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\ + This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\ + On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \ + On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \ + The precision of this operation may change as new platform support is added.", + ), Op::new( "add", OpKind::Overloaded(CoreOpTrait::Add), diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 687efef96..d1e904cd8 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -54,6 +54,21 @@ fn sqrt_f32x4(simd: S) { assert_eq!(*f32x4::sqrt(a), [2.0, 0.0, 1.0, f32::sqrt(2.0)]); } +#[simd_test] +fn approximate_recip_f32x4(simd: S) { + let a = f32x4::from_slice(simd, &[1.0, -2.0, 23.0, 9.0]); + let result = a.approximate_recip(); + let expected = [1.0, -0.5, 1. / 23., 1. / 9.]; + for i in 0..4 { + let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + #[simd_test] fn div_f32x4(simd: S) { let a = f32x4::from_slice(simd, &[4.0, 2.0, 1.0, 0.0]); @@ -2649,6 +2664,21 @@ fn sqrt_f64x2(simd: S) { assert_eq!(*a.sqrt(), [2.0, 3.0]); } +#[simd_test] +fn approximate_recip_f64x2(simd: S) { + let a = f64x2::from_slice(simd, &[1.0, -2.0]); + let result = a.approximate_recip(); + let expected = [1.0, -0.5];
 + for i in 0..2 { + let rel_error = ((result[i] - expected[i]) / expected[i]).abs(); + assert!( + rel_error < 0.005, + "approximate_recip({}) rel_error = {rel_error}", + a[i] + ); + } +} + #[simd_test] fn copysign_f64x2(simd: S) { let a = f64x2::from_slice(simd, &[1.5, -2.5]);