Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/kernels/fill.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ __global__ void fill_kernel(floatX* dst, floatX value, std::size_t count) {

//! Fills `count` elements of the device buffer `dst` with `value`, asynchronously
//! on `stream`. A zero-sized fill is a valid no-op; a null `dst` with a nonzero
//! count throws std::invalid_argument.
//! NOTE(review): relies on <stdexcept> being reachable from this TU — confirm.
template<typename floatX>
void fill_imp(floatX* dst, floatX value, std::size_t count, cudaStream_t stream) {
// Check count first so that (nullptr, 0) is accepted silently rather than throwing.
if (count == 0) return;
if (dst == nullptr) throw std::invalid_argument("dst is nullptr");
// 256 threads per block; div_ceil rounds the grid up so every element is covered.
fill_kernel<<<div_ceil(count, static_cast<std::size_t>(256)), 256, 0, stream>>> (dst, value, count);
// The launch itself is asynchronous; this surfaces launch-configuration errors immediately.
CUDA_CHECK(cudaGetLastError());
}
Expand Down
14 changes: 12 additions & 2 deletions src/training/adamw_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,17 +177,27 @@ void AdamWStateManager::allocate_state(IModel& model, cudaStream_t stream, EAllo
}

mBlocksMScales.resize(mConfig.NumLayers);

if(mMType == ETensorDType::FP8_E4M3) {
auto prepare_shape_for_scales = [&](auto&& c) {
// creates shards same as main weight
auto sharded = shard_empty_container(flattened_view(c), mWorld);
// flatten the local shard
auto flattened = flattened_view(sharded);
// and group into scaling groups
auto grouped = shard_empty_container(std::move(flattened), 128);
Comment on lines +185 to +188
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The second call to flattened_view(sharded) on this line is redundant. After shard_empty_container(flattened_view(c), mWorld), all tensors in the container are already 1D (rank 1), so applying flattened_view again produces the same shapes. The sharded container can be passed directly to shard_empty_container on the next line. Removing this call would simplify the logic and avoid allocating an unnecessary intermediate container.

Suggested change
// flatten the local shard
auto flattened = flattened_view(sharded);
// and group into scaling groups
auto grouped = shard_empty_container(std::move(flattened), 128);
// tensors in 'sharded' are already 1D; directly group into scaling groups
auto grouped = shard_empty_container(std::move(sharded), 128);

Copilot uses AI. Check for mistakes.
return grouped;
};
// we first shard by mWorld (matching the main-weight sharding), then shard the
// local flattened view by 128, yielding 1 scale per 128 weights.
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment on this line ("we 'shard' for 128 as many GPUs, so that we get 1 scale per 128 weights") is outdated. The old code directly divided by 128 * mWorld; the new code first shards by mWorld (matching main weight sharding), then flattens the local shard, and shards by 128 to get one scale per 128 weights. The comment should be updated to reflect the two-step sharding in prepare_shape_for_scales.

Suggested change
// we "shard" for 128 as many GPUs, so that we get 1 scale per 128 weights.
// we first shard by mWorld (matching main weights), then shard the local
// flattened view by 128 to get 1 scale per 128 weights.

Copilot uses AI. Check for mistakes.
for (int i = 0; i < mConfig.NumLayers; ++i) {
mBlocksMScales[i] = shard_empty_container(model.create_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32), 128 * mWorld);
mBlocksMScales[i] = prepare_shape_for_scales(model.create_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32));
alloc_lazy.allocate(mBlocksMScales[i]);
alloc_lazy.commit(alloc, EAllocationType::ON_DEVICE, "m_block_scales");
visit([stream](Tensor& t){
fill_constant(t, 1.f, t.nelem(), stream);
}, mBlocksMScales[i]);
}
mNonBlockMScales = shard_empty_container(model.create_non_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32), 128 * mWorld);
mNonBlockMScales = prepare_shape_for_scales(model.create_non_block_container(mConfig, ETensorDType::FP32, ETensorDType::FP32));
alloc_lazy.allocate(mNonBlockMScales);
alloc_lazy.commit(alloc, EAllocationType::ON_DEVICE, "m_nonblock_scales");
visit([stream](Tensor& t){
Expand Down
16 changes: 16 additions & 0 deletions src/utilities/tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,14 @@ TensorShard shard_view(const Tensor& src, int idx, int num) {
return TensorShard{shard, idx, num, src.Sizes};
}

//! Returns a rank-1 view of `src` covering all of its elements.
//! The copy presumably shares `src`'s underlying storage (view semantics) —
//! only the shape metadata is rewritten here.
Tensor flat_view(const Tensor& src) {
    Tensor view{src};
    view.Rank = 1;
    // Zero out the full size array first, then place the element count in axis 0.
    view.Sizes.fill(0);
    view.Sizes[0] = src.nelem();
    return view;
}

void visit(const std::function<void(Tensor&)>& func, SimpleTensorContainer& container) {
auto cs = container.num_tensors();
for(std::size_t i = 0; i < cs; ++i) {
Expand Down Expand Up @@ -168,6 +176,14 @@ GenericTensorContainer shard_empty_container(GenericTensorContainer&& c, int wor
return std::move(c);
}

//! Builds a new container holding a rank-1 (flattened) view of every tensor in `c`.
GenericTensorContainer flattened_view(const GenericTensorContainer& c) {
    const std::size_t count = c.num_tensors();
    std::vector<Tensor> views;
    views.reserve(count);
    for (std::size_t idx = 0; idx < count; ++idx) {
        views.push_back(flat_view(c.get_tensor(idx)));
    }
    return GenericTensorContainer{views};
}

GenericTensorContainer shard_view(const GenericTensorContainer& c, int rank, int world) {
std::vector<Tensor> shards(c.num_tensors());
for (std::size_t i = 0; i < c.num_tensors(); ++i) {
Expand Down
2 changes: 2 additions & 0 deletions src/utilities/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,4 +160,6 @@ class TensorShard : public Tensor {
};

TensorShard shard_view(const Tensor& src, int idx, int num);
Tensor flat_view(const Tensor& src);

#endif //LLMQ_SRC_UTILS_TENSOR_H
3 changes: 3 additions & 0 deletions src/utilities/tensor_container.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ class GenericTensorContainer final : public SimpleTensorContainer {
//! are `nullptr`, but sizes have been set up.
GenericTensorContainer shard_empty_container(GenericTensorContainer&& c, int world);

//! Flattens all tensors in the container.
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doc comment says "Flattens all tensors is the container" — "is" should be "in".

Suggested change
//! Flattens all tensors is the container.
//! Flattens all tensors in the container.

Copilot uses AI. Check for mistakes.
GenericTensorContainer flattened_view(const GenericTensorContainer& c);

//! Shards a non-empty tensor container. The returned container's tensors are _views_ into
//! the original container's tensors.
GenericTensorContainer shard_view(const GenericTensorContainer& c, int rank, int world);
Expand Down