Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/kernels/fill.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ __global__ void fill_kernel(floatX* dst, floatX value, std::size_t count) {

//! Fills `count` elements of the device buffer `dst` with `value`, asynchronously
//! on `stream`. A zero-sized fill is a valid no-op; a null `dst` with a nonzero
//! count throws std::invalid_argument.
//! NOTE(review): relies on <stdexcept> being reachable from this TU — confirm.
template<typename floatX>
void fill_imp(floatX* dst, floatX value, std::size_t count, cudaStream_t stream) {
// Check count first so that (nullptr, 0) is accepted silently rather than throwing.
if (count == 0) return;
if (dst == nullptr) throw std::invalid_argument("dst is nullptr");
// 256 threads per block; div_ceil rounds the grid up so every element is covered.
fill_kernel<<<div_ceil(count, static_cast<std::size_t>(256)), 256, 0, stream>>> (dst, value, count);
// The launch itself is asynchronous; this surfaces launch-configuration errors immediately.
CUDA_CHECK(cudaGetLastError());
}
Expand Down
14 changes: 12 additions & 2 deletions src/training/adamw_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,17 +177,27 @@ void AdamWStateManager::allocate_state(IModel& model, cudaStream_t stream, EAllo
}

mBlocksMScales.resize(mConfig.NumLayers);

if(mMType == ETensorDType::FP8_E4M3) {
auto prepare_shape_for_scales = [&](auto&& c) {
// creates shards same as main weight
auto sharded = shard_empty_container(flattened_view(c), mWorld);
// flatten the local shard
auto flattened = flattened_view(sharded);
// and group into scaling groups
auto grouped = shard_empty_container(std::move(flattened), 128);
Comment on lines +185 to +188
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The second call to flattened_view(sharded) on this line is redundant. After shard_empty_container(flattened_view(c), mWorld), all tensors in the container are already 1D (rank 1), so applying flattened_view again produces the same shapes. The sharded container can be passed directly to shard_empty_container on the next line. Removing this call would simplify the logic and avoid allocating an unnecessary intermediate container.

Suggested change
// flatten the local shard
auto flattened = flattened_view(sharded);
// and group into scaling groups
auto grouped = shard_empty_container(std::move(flattened), 128);
// tensors in 'sharded' are already 1D; directly group into scaling groups
auto grouped = shard_empty_container(std::move(sharded), 128);

Copilot uses AI. Check for mistakes.
return grouped;
};
// we first shard by mWorld (matching the main-weight sharding), then shard the
// local flattened view by 128, yielding 1 scale per 128 weights.
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment on this line ("we 'shard' for 128 as many GPUs, so that we get 1 scale per 128 weights") is outdated. The old code directly divided by 128 * mWorld; the new code first shards by mWorld (matching main weight sharding), then flattens the local shard, and shards by 128 to get one scale per 128 weights. The comment should be updated to reflect the two-step sharding in prepare_shape_for_scales.

Suggested change
// we "shard" for 128 as many GPUs, so that we get 1 scale per 128 weights.
// we first shard by mWorld (matching main weights), then shard the local
// flattened view by 128 to get 1 scale per 128 weights.

Copilot uses AI. Check for mistakes.
for (int i = 0; i < mConfig.NumLayers; ++i) {
mBlocksMScales[i] = shard_empty_container(model.create_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32), 128 * mWorld);
mBlocksMScales[i] = prepare_shape_for_scales(model.create_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32));
alloc_lazy.allocate(mBlocksMScales[i]);
alloc_lazy.commit(alloc, EAllocationType::ON_DEVICE, "m_block_scales");
visit([stream](Tensor& t){
fill_constant(t, 1.f, t.nelem(), stream);
}, mBlocksMScales[i]);
}
mNonBlockMScales = shard_empty_container(model.create_non_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32), 128 * mWorld);
mNonBlockMScales = prepare_shape_for_scales(model.create_non_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32));
alloc_lazy.allocate(mNonBlockMScales);
alloc_lazy.commit(alloc, EAllocationType::ON_DEVICE, "m_nonblock_scales");
visit([stream](Tensor& t){
Expand Down
16 changes: 16 additions & 0 deletions src/utilities/tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,14 @@ TensorShard shard_view(const Tensor& src, int idx, int num) {
return TensorShard{shard, idx, num, src.Sizes};
}

//! Returns a rank-1 view of `src` covering all of its elements.
//! The copy presumably shares `src`'s underlying storage (view semantics) —
//! only the shape metadata is rewritten here.
Tensor flat_view(const Tensor& src) {
    Tensor view{src};
    view.Rank = 1;
    // Zero out the full size array first, then place the element count in axis 0.
    view.Sizes.fill(0);
    view.Sizes[0] = src.nelem();
    return view;
}

void visit(const std::function<void(Tensor&)>& func, SimpleTensorContainer& container) {
auto cs = container.num_tensors();
for(std::size_t i = 0; i < cs; ++i) {
Expand Down Expand Up @@ -168,6 +176,14 @@ GenericTensorContainer shard_empty_container(GenericTensorContainer&& c, int wor
return std::move(c);
}

//! Builds a new container holding a rank-1 (flattened) view of every tensor in `c`.
GenericTensorContainer flattened_view(const GenericTensorContainer& c) {
    const std::size_t count = c.num_tensors();
    std::vector<Tensor> views;
    views.reserve(count);
    for (std::size_t idx = 0; idx < count; ++idx) {
        views.push_back(flat_view(c.get_tensor(idx)));
    }
    return GenericTensorContainer{views};
}

GenericTensorContainer shard_view(const GenericTensorContainer& c, int rank, int world) {
std::vector<Tensor> shards(c.num_tensors());
for (std::size_t i = 0; i < c.num_tensors(); ++i) {
Expand Down
2 changes: 2 additions & 0 deletions src/utilities/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,4 +160,6 @@ class TensorShard : public Tensor {
};

TensorShard shard_view(const Tensor& src, int idx, int num);
Tensor flat_view(const Tensor& src);

#endif //LLMQ_SRC_UTILS_TENSOR_H
3 changes: 3 additions & 0 deletions src/utilities/tensor_container.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ class GenericTensorContainer final : public SimpleTensorContainer {
//! are `nullptr`, but sizes have been set up.
GenericTensorContainer shard_empty_container(GenericTensorContainer&& c, int world);

//! Flattens all tensors in the container.
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doc comment says "Flattens all tensors is the container" — "is" should be "in".

Suggested change
//! Flattens all tensors is the container.
//! Flattens all tensors in the container.

Copilot uses AI. Check for mistakes.
GenericTensorContainer flattened_view(const GenericTensorContainer& c);

//! Shards a non-empty tensor container. The returned container's tensors are _views_ into
//! the original container's tensors.
GenericTensorContainer shard_view(const GenericTensorContainer& c, int rank, int world);
Expand Down