From 11631c9ece0c03712743a94d99aaef949e681ac6 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 6 Mar 2026 22:23:52 +0100 Subject: [PATCH 1/3] utility function to flatten tensor --- src/utilities/tensor.cpp | 16 ++++++++++++++++ src/utilities/tensor.h | 2 ++ src/utilities/tensor_container.h | 3 +++ 3 files changed, 21 insertions(+) diff --git a/src/utilities/tensor.cpp b/src/utilities/tensor.cpp index f434b68..347c2e7 100644 --- a/src/utilities/tensor.cpp +++ b/src/utilities/tensor.cpp @@ -119,6 +119,14 @@ TensorShard shard_view(const Tensor& src, int idx, int num) { return TensorShard{shard, idx, num, src.Sizes}; } +Tensor flat_view(const Tensor& src) { + Tensor dst{src}; + dst.Sizes.fill(0); + dst.Sizes[0] = src.nelem(); + dst.Rank = 1; + return dst; +} + void visit(const std::function& func, SimpleTensorContainer& container) { auto cs = container.num_tensors(); for(std::size_t i = 0; i < cs; ++i) { @@ -168,6 +176,14 @@ GenericTensorContainer shard_empty_container(GenericTensorContainer&& c, int wor return std::move(c); } +GenericTensorContainer flattened_view(const GenericTensorContainer& c) { + std::vector<Tensor> flats(c.num_tensors()); + for (std::size_t i = 0; i < c.num_tensors(); ++i) { + flats.at(i) = flat_view(c.get_tensor(i)); + } + return GenericTensorContainer{flats}; + } + GenericTensorContainer shard_view(const GenericTensorContainer& c, int rank, int world) { std::vector shards(c.num_tensors()); for (std::size_t i = 0; i < c.num_tensors(); ++i) { diff --git a/src/utilities/tensor.h b/src/utilities/tensor.h index 7fd2798..200bad7 100644 --- a/src/utilities/tensor.h +++ b/src/utilities/tensor.h @@ -160,4 +160,6 @@ class TensorShard : public Tensor { }; TensorShard shard_view(const Tensor& src, int idx, int num); +Tensor flat_view(const Tensor& src); + #endif //LLMQ_SRC_UTILS_TENSOR_H diff --git a/src/utilities/tensor_container.h b/src/utilities/tensor_container.h index fa89b88..5653a4c 100644 --- a/src/utilities/tensor_container.h +++ 
b/src/utilities/tensor_container.h @@ -61,6 +61,9 @@ class GenericTensorContainer final : public SimpleTensorContainer { //! are `nullptr`, but sizes have been set up. GenericTensorContainer shard_empty_container(GenericTensorContainer&& c, int world); +//! Flattens all tensors in the container. +GenericTensorContainer flattened_view(const GenericTensorContainer& c); + //! Shards a non-empty tensor container. The returned container's tensors are _views_ into //! the original container's tensors. GenericTensorContainer shard_view(const GenericTensorContainer& c, int rank, int world); From b718b9af828efb08954bbc03be72b80835a254b0 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 6 Mar 2026 22:24:37 +0100 Subject: [PATCH 2/3] make FP8-M work also if first dim is not divisible by 512 --- src/training/adamw_optimizer.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/training/adamw_optimizer.cpp b/src/training/adamw_optimizer.cpp index 99889e7..1b672e1 100644 --- a/src/training/adamw_optimizer.cpp +++ b/src/training/adamw_optimizer.cpp @@ -177,17 +177,27 @@ void AdamWStateManager::allocate_state(IModel& model, cudaStream_t stream, EAllo } mBlocksMScales.resize(mConfig.NumLayers); + if(mMType == ETensorDType::FP8_E4M3) { + auto prepare_shape_for_scales = [&](auto&& c) { + // creates shards same as main weight + auto sharded = shard_empty_container(flattened_view(c), mWorld); + // flatten the local shard + auto flattened = flattened_view(sharded); + // and group into scaling groups + auto grouped = shard_empty_container(std::move(flattened), 128); + return grouped; + }; // we "shard" for 128 as many GPUs, so that we get 1 scale per 128 weights. 
for (int i = 0; i < mConfig.NumLayers; ++i) { - mBlocksMScales[i] = shard_empty_container(model.create_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32), 128 * mWorld); + mBlocksMScales[i] = prepare_shape_for_scales(model.create_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32)); alloc_lazy.allocate(mBlocksMScales[i]); alloc_lazy.commit(alloc, EAllocationType::ON_DEVICE, "m_block_scales"); visit([stream](Tensor& t){ fill_constant(t, 1.f, t.nelem(), stream); }, mBlocksMScales[i]); } - mNonBlockMScales = shard_empty_container(model.create_non_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32), 128 * mWorld); + mNonBlockMScales = prepare_shape_for_scales(model.create_non_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32)); alloc_lazy.allocate(mNonBlockMScales); alloc_lazy.commit(alloc, EAllocationType::ON_DEVICE, "m_nonblock_scales"); visit([stream](Tensor& t){ From 8b14573b4b96fb5af09a778a62cf4b4e159c3022 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Fri, 6 Mar 2026 22:24:50 +0100 Subject: [PATCH 3/3] prevent crash when trying to fill an empty tensor --- src/kernels/fill.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/kernels/fill.cu b/src/kernels/fill.cu index 8b63e92..a6ac6aa 100644 --- a/src/kernels/fill.cu +++ b/src/kernels/fill.cu @@ -17,6 +17,8 @@ __global__ void fill_kernel(floatX* dst, floatX value, std::size_t count) { template void fill_imp(floatX* dst, floatX value, std::size_t count, cudaStream_t stream) { + if (count == 0) return; + if (dst == nullptr) throw std::invalid_argument("dst is nullptr"); fill_kernel<<(256)), 256, 0, stream>>> (dst, value, count); CUDA_CHECK(cudaGetLastError()); }