Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions fearless_simd/src/generated/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,10 @@ impl Simd for Avx2 {
unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
}
#[inline(always)]
fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering whether we should just spell reciprocal out. But should be fine this way!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondered the same thing, but decided to mirror e.g. f32::recip.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair!

unsafe { _mm_rcp_ps(a.into()).simd_into(self) }
}
/// Lane-wise addition of two `f32x4` vectors via the `_mm_add_ps` intrinsic.
#[inline(always)]
fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
    // NOTE(review): soundness presumably rests on the `Avx2` token proving CPU support — confirm.
    unsafe {
        let sum = _mm_add_ps(a.into(), b.into());
        sum.simd_into(self)
    }
}
Expand Down Expand Up @@ -2182,6 +2186,10 @@ impl Simd for Avx2 {
unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
}
/// Reciprocal of each `f64x2` lane, computed as a plain `1.0 / a` division
/// (no hardware estimate intrinsic is used here).
#[inline(always)]
fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
    let ones = self.splat_f64x2(1.0);
    ones / a
}
/// Lane-wise addition of two `f64x2` vectors via the `_mm_add_pd` intrinsic.
#[inline(always)]
fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
    // NOTE(review): soundness presumably rests on the `Avx2` token proving CPU support — confirm.
    unsafe {
        let sum = _mm_add_pd(a.into(), b.into());
        sum.simd_into(self)
    }
}
Expand Down Expand Up @@ -2558,6 +2566,10 @@ impl Simd for Avx2 {
unsafe { _mm256_sqrt_ps(a.into()).simd_into(self) }
}
/// Approximate reciprocal of each `f32x8` lane via the `_mm256_rcp_ps`
/// hardware estimate; the result is not an exact `1.0 / a`.
#[inline(always)]
fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
    // NOTE(review): soundness presumably rests on the `Avx2` token proving CPU support — confirm.
    unsafe {
        let estimate = _mm256_rcp_ps(a.into());
        estimate.simd_into(self)
    }
}
/// Lane-wise addition of two `f32x8` vectors via the `_mm256_add_ps` intrinsic.
#[inline(always)]
fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
    // NOTE(review): soundness presumably rests on the `Avx2` token proving CPU support — confirm.
    unsafe {
        let sum = _mm256_add_ps(a.into(), b.into());
        sum.simd_into(self)
    }
}
Expand Down Expand Up @@ -4990,6 +5002,10 @@ impl Simd for Avx2 {
unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) }
}
/// Reciprocal of each `f64x4` lane, computed as a plain `1.0 / a` division
/// (no hardware estimate intrinsic is used here).
#[inline(always)]
fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
    let ones = self.splat_f64x4(1.0);
    ones / a
}
/// Lane-wise addition of two `f64x4` vectors via the `_mm256_add_pd` intrinsic.
#[inline(always)]
fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
    // NOTE(review): soundness presumably rests on the `Avx2` token proving CPU support — confirm.
    unsafe {
        let sum = _mm256_add_pd(a.into(), b.into());
        sum.simd_into(self)
    }
}
Expand Down Expand Up @@ -5423,6 +5439,14 @@ impl Simd for Avx2 {
self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
}
/// Approximate reciprocal of an `f32x16`: splits the input into two `f32x8`
/// parts, runs `approximate_recip_f32x8` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
    let (part0, part1) = self.split_f32x16(a);
    let recip0 = self.approximate_recip_f32x8(part0);
    let recip1 = self.approximate_recip_f32x8(part1);
    self.combine_f32x8(recip0, recip1)
}
#[inline(always)]
fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
Expand Down Expand Up @@ -8026,6 +8050,14 @@ impl Simd for Avx2 {
self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
}
/// Approximate reciprocal of an `f64x8`: splits the input into two `f64x4`
/// parts, runs `approximate_recip_f64x4` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
    let (part0, part1) = self.split_f64x8(a);
    let recip0 = self.approximate_recip_f64x4(part0);
    let recip1 = self.approximate_recip_f64x4(part1);
    self.combine_f64x4(recip0, recip1)
}
#[inline(always)]
fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
Expand Down
40 changes: 40 additions & 0 deletions fearless_simd/src/generated/fallback.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,10 @@ impl Simd for Fallback {
.simd_into(self)
}
#[inline(always)]
fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
self.splat_f32x4(1.0) / a
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't tried it, but does division work without splatting? I think it works for multiplication, at least.

}
#[inline(always)]
fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
[
f32::add(a[0usize], &b[0usize]),
Expand Down Expand Up @@ -3816,6 +3820,10 @@ impl Simd for Fallback {
[f64::sqrt(a[0usize]), f64::sqrt(a[1usize])].simd_into(self)
}
/// Reciprocal of each `f64x2` lane, computed as a plain `1.0 / a` division.
#[inline(always)]
fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
    let ones = self.splat_f64x2(1.0);
    ones / a
}
#[inline(always)]
fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
[
f64::add(a[0usize], &b[0usize]),
Expand Down Expand Up @@ -4226,6 +4234,14 @@ impl Simd for Fallback {
self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
}
/// Approximate reciprocal of an `f32x8`: splits the input into two `f32x4`
/// parts, runs `approximate_recip_f32x4` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
    let (part0, part1) = self.split_f32x8(a);
    let recip0 = self.approximate_recip_f32x4(part0);
    let recip1 = self.approximate_recip_f32x4(part1);
    self.combine_f32x4(recip0, recip1)
}
#[inline(always)]
fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
Expand Down Expand Up @@ -6482,6 +6498,14 @@ impl Simd for Fallback {
self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
}
/// Approximate reciprocal of an `f64x4`: splits the input into two `f64x2`
/// parts, runs `approximate_recip_f64x2` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
    let (part0, part1) = self.split_f64x4(a);
    let recip0 = self.approximate_recip_f64x2(part0);
    let recip1 = self.approximate_recip_f64x2(part1);
    self.combine_f64x2(recip0, recip1)
}
#[inline(always)]
fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
Expand Down Expand Up @@ -6918,6 +6942,14 @@ impl Simd for Fallback {
self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
}
/// Approximate reciprocal of an `f32x16`: splits the input into two `f32x8`
/// parts, runs `approximate_recip_f32x8` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
    let (part0, part1) = self.split_f32x16(a);
    let recip0 = self.approximate_recip_f32x8(part0);
    let recip1 = self.approximate_recip_f32x8(part1);
    self.combine_f32x8(recip0, recip1)
}
#[inline(always)]
fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
Expand Down Expand Up @@ -9296,6 +9328,14 @@ impl Simd for Fallback {
self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
}
/// Approximate reciprocal of an `f64x8`: splits the input into two `f64x4`
/// parts, runs `approximate_recip_f64x4` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
    let (part0, part1) = self.split_f64x8(a);
    let recip0 = self.approximate_recip_f64x4(part0);
    let recip1 = self.approximate_recip_f64x4(part1);
    self.combine_f64x4(recip0, recip1)
}
#[inline(always)]
fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
Expand Down
40 changes: 40 additions & 0 deletions fearless_simd/src/generated/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ impl Simd for Neon {
unsafe { vsqrtq_f32(a.into()).simd_into(self) }
}
/// Approximate reciprocal of each `f32x4` lane via the `vrecpeq_f32`
/// hardware estimate; the result is not an exact `1.0 / a`.
#[inline(always)]
fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
    // NOTE(review): soundness presumably rests on the `Neon` token proving CPU support — confirm.
    unsafe {
        let estimate = vrecpeq_f32(a.into());
        estimate.simd_into(self)
    }
}
/// Lane-wise addition of two `f32x4` vectors via the `vaddq_f32` intrinsic.
#[inline(always)]
fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
    // NOTE(review): soundness presumably rests on the `Neon` token proving CPU support — confirm.
    unsafe {
        let sum = vaddq_f32(a.into(), b.into());
        sum.simd_into(self)
    }
}
Expand Down Expand Up @@ -2062,6 +2066,10 @@ impl Simd for Neon {
unsafe { vsqrtq_f64(a.into()).simd_into(self) }
}
/// Approximate reciprocal of each `f64x2` lane via the `vrecpeq_f64`
/// hardware estimate; the result is not an exact `1.0 / a`.
#[inline(always)]
fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
    // NOTE(review): soundness presumably rests on the `Neon` token proving CPU support — confirm.
    unsafe {
        let estimate = vrecpeq_f64(a.into());
        estimate.simd_into(self)
    }
}
/// Lane-wise addition of two `f64x2` vectors via the `vaddq_f64` intrinsic.
#[inline(always)]
fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
    // NOTE(review): soundness presumably rests on the `Neon` token proving CPU support — confirm.
    unsafe {
        let sum = vaddq_f64(a.into(), b.into());
        sum.simd_into(self)
    }
}
Expand Down Expand Up @@ -2455,6 +2463,14 @@ impl Simd for Neon {
self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
}
/// Approximate reciprocal of an `f32x8`: splits the input into two `f32x4`
/// parts, runs `approximate_recip_f32x4` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
    let (part0, part1) = self.split_f32x8(a);
    let recip0 = self.approximate_recip_f32x4(part0);
    let recip1 = self.approximate_recip_f32x4(part1);
    self.combine_f32x4(recip0, recip1)
}
#[inline(always)]
fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
Expand Down Expand Up @@ -5128,6 +5144,14 @@ impl Simd for Neon {
self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
}
/// Approximate reciprocal of an `f64x4`: splits the input into two `f64x2`
/// parts, runs `approximate_recip_f64x2` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
    let (part0, part1) = self.split_f64x4(a);
    let recip0 = self.approximate_recip_f64x2(part0);
    let recip1 = self.approximate_recip_f64x2(part1);
    self.combine_f64x2(recip0, recip1)
}
#[inline(always)]
fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
Expand Down Expand Up @@ -5668,6 +5692,14 @@ impl Simd for Neon {
self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
}
/// Approximate reciprocal of an `f32x16`: splits the input into two `f32x8`
/// parts, runs `approximate_recip_f32x8` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
    let (part0, part1) = self.split_f32x16(a);
    let recip0 = self.approximate_recip_f32x8(part0);
    let recip1 = self.approximate_recip_f32x8(part1);
    self.combine_f32x8(recip0, recip1)
}
#[inline(always)]
fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
Expand Down Expand Up @@ -8475,6 +8507,14 @@ impl Simd for Neon {
self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
}
/// Approximate reciprocal of an `f64x8`: splits the input into two `f64x4`
/// parts, runs `approximate_recip_f64x4` on each, and recombines the results.
#[inline(always)]
fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
    let (part0, part1) = self.split_f64x8(a);
    let recip0 = self.approximate_recip_f64x4(part0);
    let recip1 = self.approximate_recip_f64x4(part1);
    self.combine_f64x4(recip0, recip1)
}
#[inline(always)]
fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
Expand Down
14 changes: 14 additions & 0 deletions fearless_simd/src/generated/simd_trait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ pub trait Simd:
fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
// Backend summary for this width: Avx2 uses `_mm_rcp_ps`, Neon uses `vrecpeq_f32`,
// and Fallback computes an exact `1.0 / a`.
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self>;
#[doc = "Subtract two vectors element-wise."]
Expand Down Expand Up @@ -901,6 +903,8 @@ pub trait Simd:
fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
// Backend summary for this width: Avx2 and Fallback compute an exact `1.0 / a`,
// while Neon uses the `vrecpeq_f64` estimate.
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self>;
#[doc = "Subtract two vectors element-wise."]
Expand Down Expand Up @@ -1046,6 +1050,8 @@ pub trait Simd:
fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
// Backend summary for this width: Avx2 uses `_mm256_rcp_ps`; Neon and Fallback
// split into two `f32x4` parts and delegate to `approximate_recip_f32x4`.
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self>;
#[doc = "Subtract two vectors element-wise."]
Expand Down Expand Up @@ -1817,6 +1823,8 @@ pub trait Simd:
fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
// Backend summary for this width: Avx2 computes an exact `1.0 / a`; Neon and Fallback
// split into two `f64x2` parts and delegate to `approximate_recip_f64x2`.
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self>;
#[doc = "Subtract two vectors element-wise."]
Expand Down Expand Up @@ -1966,6 +1974,8 @@ pub trait Simd:
fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
// Backend summary for this width: Avx2, Neon and Fallback all split into two
// `f32x8` parts and delegate to `approximate_recip_f32x8`.
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self>;
#[doc = "Subtract two vectors element-wise."]
Expand Down Expand Up @@ -2731,6 +2741,8 @@ pub trait Simd:
fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
// Backend summary for this width: Avx2, Neon and Fallback all split into two
// `f64x4` parts and delegate to `approximate_recip_f64x4`.
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
#[doc = "Add two vectors element-wise."]
fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self>;
#[doc = "Subtract two vectors element-wise."]
Expand Down Expand Up @@ -2996,6 +3008,8 @@ pub trait SimdFloat<S: Simd>:
fn abs(self) -> Self;
#[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
fn sqrt(self) -> Self;
// The vector-type implementations forward to the width-specific
// `approximate_recip_*` method on the vector's `simd` token.
#[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
fn approximate_recip(self) -> Self;
#[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."]
fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self;
#[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each element is all ones if the corresponding elements are equal, and all zeroes if not."]
Expand Down
24 changes: 24 additions & 0 deletions fearless_simd/src/generated/simd_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ impl<S: Simd> crate::SimdFloat<S> for f32x4<S> {
self.simd.sqrt_f32x4(self)
}
/// Forwards to `approximate_recip_f32x4` on this vector's `simd` token.
#[inline(always)]
fn approximate_recip(self) -> Self {
    let backend = self.simd;
    backend.approximate_recip_f32x4(self)
}
/// Converts `rhs` into a vector with this vector's `simd` token, then
/// forwards to `copysign_f32x4`.
#[inline(always)]
fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
    let sign_source: Self = rhs.simd_into(self.simd);
    self.simd.copysign_f32x4(self, sign_source)
}
Expand Down Expand Up @@ -1928,6 +1932,10 @@ impl<S: Simd> crate::SimdFloat<S> for f64x2<S> {
self.simd.sqrt_f64x2(self)
}
/// Forwards to `approximate_recip_f64x2` on this vector's `simd` token.
#[inline(always)]
fn approximate_recip(self) -> Self {
    let backend = self.simd;
    backend.approximate_recip_f64x2(self)
}
/// Converts `rhs` into a vector with this vector's `simd` token, then
/// forwards to `copysign_f64x2`.
#[inline(always)]
fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
    let sign_source: Self = rhs.simd_into(self.simd);
    self.simd.copysign_f64x2(self, sign_source)
}
Expand Down Expand Up @@ -2311,6 +2319,10 @@ impl<S: Simd> crate::SimdFloat<S> for f32x8<S> {
self.simd.sqrt_f32x8(self)
}
/// Forwards to `approximate_recip_f32x8` on this vector's `simd` token.
#[inline(always)]
fn approximate_recip(self) -> Self {
    let backend = self.simd;
    backend.approximate_recip_f32x8(self)
}
/// Converts `rhs` into a vector with this vector's `simd` token, then
/// forwards to `copysign_f32x8`.
#[inline(always)]
fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
    let sign_source: Self = rhs.simd_into(self.simd);
    self.simd.copysign_f32x8(self, sign_source)
}
Expand Down Expand Up @@ -4182,6 +4194,10 @@ impl<S: Simd> crate::SimdFloat<S> for f64x4<S> {
self.simd.sqrt_f64x4(self)
}
/// Forwards to `approximate_recip_f64x4` on this vector's `simd` token.
#[inline(always)]
fn approximate_recip(self) -> Self {
    let backend = self.simd;
    backend.approximate_recip_f64x4(self)
}
/// Converts `rhs` into a vector with this vector's `simd` token, then
/// forwards to `copysign_f64x4`.
#[inline(always)]
fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
    let sign_source: Self = rhs.simd_into(self.simd);
    self.simd.copysign_f64x4(self, sign_source)
}
Expand Down Expand Up @@ -4585,6 +4601,10 @@ impl<S: Simd> crate::SimdFloat<S> for f32x16<S> {
self.simd.sqrt_f32x16(self)
}
/// Forwards to `approximate_recip_f32x16` on this vector's `simd` token.
#[inline(always)]
fn approximate_recip(self) -> Self {
    let backend = self.simd;
    backend.approximate_recip_f32x16(self)
}
/// Converts `rhs` into a vector with this vector's `simd` token, then
/// forwards to `copysign_f32x16`.
#[inline(always)]
fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
    let sign_source: Self = rhs.simd_into(self.simd);
    self.simd.copysign_f32x16(self, sign_source)
}
Expand Down Expand Up @@ -6406,6 +6426,10 @@ impl<S: Simd> crate::SimdFloat<S> for f64x8<S> {
self.simd.sqrt_f64x8(self)
}
/// Forwards to `approximate_recip_f64x8` on this vector's `simd` token.
#[inline(always)]
fn approximate_recip(self) -> Self {
    let backend = self.simd;
    backend.approximate_recip_f64x8(self)
}
/// Converts `rhs` into a vector with this vector's `simd` token, then
/// forwards to `copysign_f64x8`.
#[inline(always)]
fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self {
    let sign_source: Self = rhs.simd_into(self.simd);
    self.simd.copysign_f64x8(self, sign_source)
}
Expand Down
Loading