From 0d3ee65244b6a195593aa3b0023cd042b7be97ee Mon Sep 17 00:00:00 2001
From: Paul Flynn <paul@arkavo.com>
Date: Wed, 11 Mar 2026 21:28:09 -0400
Subject: [PATCH 1/6] ggml : transpose fused GDN state access for coalesced
 memory reads (#20436)

The fused Gated Delta Net kernel accessed the [S_v, S_v] state matrix
column-wise on row-major storage, causing strided reads (stride S_v =
128 floats = 512 bytes) that waste GPU cache bandwidth. This produced a
39% regression on Qwen3.5-9B (Metal, M4 Max) compared to the unfused
path.

Transpose the state indexing so threads read contiguously:
- Metal: s_ptr[is*S_v] -> s_ptr[is] (stride 1 vs S_v)
- CUDA:  curr_state[i*S_v+col] -> curr_state[col*S_v+i] (coalesced)
- CPU:   restructured loops for row-wise transposed access

Also add --fused-gdn [on|off|auto] CLI flag (mirrors --flash-attn) so
users can control fused GDN independently of auto-detection.

All GATED_DELTA_NET backend-ops tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 common/arg.cpp                        | 15 ++++++++++
 common/common.cpp                     |  1 +
 common/common.h                       |  1 +
 ggml/src/ggml-cpu/ops.cpp             | 42 +++++++++++++++++----------
 ggml/src/ggml-cuda/gated_delta_net.cu |  7 +++--
 ggml/src/ggml-metal/ggml-metal.metal  |  9 +++---
 include/llama.h                       |  9 ++++++
 src/llama-context.cpp                 |  8 +++--
 src/llama.cpp                         | 12 ++++++++
 9 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 41da8563d63..0f7caf1c8f4 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1339,6 +1339,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                                    string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
                            }
                        }).set_env("LLAMA_ARG_FLASH_ATTN"));
+    add_opt(common_arg({ "-fgdn", "--fused-gdn" }, "[on|off|auto]",
+                       string_format("set fused Gated Delta Net kernel use ('on', 'off', or 'auto', default: '%s')",
+                                     llama_fused_gdn_type_name(params.fused_gdn_type)),
+                       [](common_params & params, const std::string & value) {
+                           if (is_truthy(value)) {
+                               params.fused_gdn_type = LLAMA_FUSED_GDN_TYPE_ENABLED;
+                           } else if (is_falsey(value)) {
+                               params.fused_gdn_type = LLAMA_FUSED_GDN_TYPE_DISABLED;
+                           } else if (is_autoy(value)) {
+                               params.fused_gdn_type = LLAMA_FUSED_GDN_TYPE_AUTO;
+                           } else {
+                               throw std::runtime_error(
+                                   string_format("error: unknown value for --fused-gdn: '%s'\n", value.c_str()));
+                           }
+                       }).set_env("LLAMA_ARG_FUSED_GDN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
         "prompt to start generation with; for system message, use -sys",
diff --git a/common/common.cpp b/common/common.cpp
index cc423d3439f..2e7ae7b0f31 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1369,6 +1369,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.pooling_type      = params.pooling_type;
     cparams.attention_type    = params.attention_type;
     cparams.flash_attn_type   = params.flash_attn_type;
+    cparams.fused_gdn_type    = params.fused_gdn_type;
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;
diff --git a/common/common.h b/common/common.h
index c5645bba460..0b30f0c8aab 100644
--- a/common/common.h
+++ b/common/common.h
@@ -411,6 +411,7 @@ struct common_params {
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
     enum llama_flash_attn_type   flash_attn_type   = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
+    enum llama_fused_gdn_type    fused_gdn_type    = LLAMA_FUSED_GDN_TYPE_AUTO;  // whether to use fused Gated Delta Net kernel
 
     struct common_params_sampling    sampling;
     struct common_params_speculative speculative;
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index fa9d27046b5..13ca087faf2 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -10477,34 +10477,46 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
             const float beta_val = *(const float *)((const char *)src_beta->data + iv3 * nbb3 + t * nbb2 + iv1 * nbb1);
             const float * g_d    =  (const float *)((const char *)src_g->data    + iv3 * nbg3 + t * nbg2 + iv1 * nbg1);
 
+            // state is stored transposed: s_out[j*S_v + i] = S[i][j]
+            // so row j of s_out = column j of S (contiguous access)
+
             if (kda) {
+                // precompute exp(g) into delta scratch (reused below)
                 for (int64_t i = 0; i < S_v; ++i) {
-                    ggml_vec_scale_f32(S_v, &s_out[i * S_v], expf(g_d[i]));
+                    delta[i] = expf(g_d[i]);
+                }
+                // S[i][:] *= exp(g[i]) => for each row j of M: M[j][i] *= exp(g[i])
+                for (int64_t j = 0; j < S_v; ++j) {
+                    ggml_vec_mul_f32(S_v, &s_out[j * S_v], &s_out[j * S_v], delta);
                 }
             } else {
                 ggml_vec_scale_f32(S_v * S_v, s_out, expf(g_d[0]));
             }
 
-            // delta[j] = sum_i S[j][i] * k[i]
-            memset(delta, 0, S_v * sizeof(float));
-            for (int64_t i = 0; i < S_v; ++i) {
-                ggml_vec_mad_f32(S_v, delta, &s_out[i * S_v], k_d[i]);
-            }
+            // delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k)
             for (int64_t j = 0; j < S_v; ++j) {
-                delta[j] = (v_d[j] - delta[j]) * beta_val;
+                const float * row_j = s_out + j * S_v;
+                float sum = 0.0f;
+                for (int64_t i = 0; i < S_v; ++i) {
+                    sum += row_j[i] * k_d[i];
+                }
+                delta[j] = (v_d[j] - sum) * beta_val;
             }
 
-            // outer product: S[j][i] += k[i] * delta[j]
-            for (int64_t i = 0; i < S_v; ++i) {
-                ggml_vec_mad_f32(S_v, &s_out[i * S_v], delta, k_d[i]);
+            // outer product: S[i][j] += k[i] * delta[j] => M[j][i] += delta[j] * k[i]
+            for (int64_t j = 0; j < S_v; ++j) {
+                ggml_vec_mad_f32(S_v, &s_out[j * S_v], k_d, delta[j]);
             }
 
-            // attn_out[j] = sum_i S[j][i] * q[i]
-            memset(attn_data, 0, S_v * sizeof(float));
-            for (int64_t i = 0; i < S_v; ++i) {
-                ggml_vec_mad_f32(S_v, attn_data, &s_out[i * S_v], q_d[i]);
+            // attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q)
+            for (int64_t j = 0; j < S_v; ++j) {
+                const float * row_j = s_out + j * S_v;
+                float sum = 0.0f;
+                for (int64_t i = 0; i < S_v; ++i) {
+                    sum += row_j[i] * q_d[i];
+                }
+                attn_data[j] = sum * scale;
             }
-            ggml_vec_scale_f32(S_v, attn_data, scale);
 
             attn_data += S_v * H; // advance to next token
         }
diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
index 5f0fa8e58df..eb38588202a 100644
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
@@ -45,10 +45,11 @@ __global__ void gated_delta_net_cuda(const float * q,
     static_assert(S_v % warp_size == 0, "S_v must be a multiple of warp_size");
     constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
     float         s_shard[rows_per_lane];
+    // state is stored transposed: M[col][i] = S[i][col], row col is contiguous
 #pragma unroll
     for (int r = 0; r < rows_per_lane; r++) {
         const int i = r * warp_size + lane;
-        s_shard[r]  = curr_state[i * S_v + col];
+        s_shard[r]  = curr_state[col * S_v + i];
     }
 
     for (int t = 0; t < n_tokens; t++) {
@@ -126,11 +127,11 @@ __global__ void gated_delta_net_cuda(const float * q,
         attn_data += S_v * H;
     }
 
-    // Write state back to global memory
+    // Write state back to global memory (transposed layout)
 #pragma unroll
     for (int r = 0; r < rows_per_lane; r++) {
         const int i          = r * warp_size + lane;
-        state[i * S_v + col] = s_shard[r];
+        state[col * S_v + i] = s_shard[r];
     }
 }
 
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 0b77d5349b8..e06f266b533 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2466,13 +2466,14 @@ kernel void kernel_gated_delta_net_impl(
 
     const float scale = 1.0f / sqrt((float)S_v);
 
-    device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20;
+    // state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous
+    device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
 
     float ls[NSG];
 
     FOR_UNROLL (short j = 0; j < NSG; j++) {
         const short is = tx*NSG + j;
-        ls[j] = s_ptr[is*S_v];
+        ls[j] = s_ptr[is];
     }
 
     device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20;
@@ -2533,11 +2534,11 @@ kernel void kernel_gated_delta_net_impl(
         g_ptr += args.ne21*G;
     }
 
-    device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20;
+    device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
 
     FOR_UNROLL (short j = 0; j < NSG; j++) {
         const short is = tx*NSG + j;
-        dst_state[is*S_v] = ls[j];
+        dst_state[is] = ls[j];
     }
 
 #undef S_v
diff --git a/include/llama.h b/include/llama.h
index c6e102abe51..05e5152d0db 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -190,6 +190,14 @@ extern "C" {
 
     LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
 
+    enum llama_fused_gdn_type {
+        LLAMA_FUSED_GDN_TYPE_AUTO     = -1,
+        LLAMA_FUSED_GDN_TYPE_DISABLED = 0,
+        LLAMA_FUSED_GDN_TYPE_ENABLED  = 1,
+    };
+
+    LLAMA_API const char * llama_fused_gdn_type_name(enum llama_fused_gdn_type fused_gdn_type);
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -338,6 +346,7 @@ extern "C" {
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type    attention_type;    // attention type to use for embeddings
         enum llama_flash_attn_type   flash_attn_type;   // when to enable Flash Attention
+        enum llama_fused_gdn_type    fused_gdn_type;    // when to enable fused Gated Delta Net kernel
 
         // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float    rope_freq_base;   // RoPE base frequency, 0 = from model
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 0be94939102..5deb81f8b17 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -150,9 +150,9 @@ llama_context::llama_context(
     cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
     cparams.auto_fa    = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
 
-    cparams.fused_gdn_ar = true;
-    cparams.fused_gdn_ch = true;
-    cparams.auto_fgdn    = true;
+    cparams.fused_gdn_ar = params.fused_gdn_type != LLAMA_FUSED_GDN_TYPE_DISABLED;
+    cparams.fused_gdn_ch = params.fused_gdn_type != LLAMA_FUSED_GDN_TYPE_DISABLED;
+    cparams.auto_fgdn    = params.fused_gdn_type == LLAMA_FUSED_GDN_TYPE_AUTO;
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -200,6 +200,7 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn   = %d\n",   __func__, cparams.causal_attn);
     LLAMA_LOG_INFO("%s: flash_attn    = %s\n",   __func__, llama_flash_attn_type_name(params.flash_attn_type));
+    LLAMA_LOG_INFO("%s: fused_gdn     = %s\n",   __func__, llama_fused_gdn_type_name(params.fused_gdn_type));
     LLAMA_LOG_INFO("%s: kv_unified    = %s\n",   __func__, cparams.kv_unified ? "true" : "false");
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
@@ -2869,6 +2870,7 @@ llama_context_params llama_context_default_params() {
         /*.pooling_type                =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.attention_type              =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
         /*.flash_attn_type             =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
+        /*.fused_gdn_type              =*/ LLAMA_FUSED_GDN_TYPE_AUTO,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,
         /*.yarn_ext_factor             =*/ -1.0f,
diff --git a/src/llama.cpp b/src/llama.cpp
index 872e659edca..4bc996bbee9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -45,6 +45,18 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
     GGML_ABORT("fatal error");
 }
 
+const char * llama_fused_gdn_type_name(enum llama_fused_gdn_type fused_gdn_type) {
+    switch (fused_gdn_type) {
+        case LLAMA_FUSED_GDN_TYPE_AUTO:
+            return "auto";
+        case LLAMA_FUSED_GDN_TYPE_DISABLED:
+            return "disabled";
+        case LLAMA_FUSED_GDN_TYPE_ENABLED:
+            return "enabled";
+    }
+    GGML_ABORT("fatal error");
+}
+
 struct llama_device_memory_data {
     int64_t total;
     int64_t free;

From 9ec08b3d2bfb082dc2f928f6df0d86a40afb1930 Mon Sep 17 00:00:00 2001
From: Paul Flynn <paul@arkavo.com>
Date: Wed, 11 Mar 2026 22:53:50 -0400
Subject: [PATCH 2/6] ggml : use SIMD dot products in CPU GDN kernel, couple
 AR/chunked fused flags

- Replace scalar inner loops with ggml_vec_dot_f32 for SIMD-optimized
  dot products in the CPU fused GDN kernel (delta and attention output)
- Couple fused_gdn_ar and fused_gdn_ch flags in auto-detection: if one
  path lacks device support, disable both to prevent state layout mismatch
  between transposed (fused) and non-transposed (unfused) formats

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ggml/src/ggml-cpu/ops.cpp | 10 ++--------
 src/llama-context.cpp     |  8 ++++++++
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 13ca087faf2..e027325cd0a 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -10495,11 +10495,8 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
 
             // delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k)
             for (int64_t j = 0; j < S_v; ++j) {
-                const float * row_j = s_out + j * S_v;
                 float sum = 0.0f;
-                for (int64_t i = 0; i < S_v; ++i) {
-                    sum += row_j[i] * k_d[i];
-                }
+                ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1);
                 delta[j] = (v_d[j] - sum) * beta_val;
             }
 
@@ -10510,11 +10507,8 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
 
             // attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q)
             for (int64_t j = 0; j < S_v; ++j) {
-                const float * row_j = s_out + j * S_v;
                 float sum = 0.0f;
-                for (int64_t i = 0; i < S_v; ++i) {
-                    sum += row_j[i] * q_d[i];
-                }
+                ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1);
                 attn_data[j] = sum * scale;
             }
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 5deb81f8b17..4586bd07192 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -538,6 +538,14 @@ void llama_context::sched_reserve() {
             }
         }
 
+        // the fused kernel uses a transposed state layout, so both paths must agree
+        // to avoid a state layout mismatch when switching between AR and chunked
+        if (cparams.fused_gdn_ar != cparams.fused_gdn_ch) {
+            cparams.fused_gdn_ar = false;
+            cparams.fused_gdn_ch = false;
+            LLAMA_LOG_WARN("%s: fused Gated Delta Net AR/chunked support mismatch, disabling both\n", __func__);
+        }
+
         cparams.auto_fgdn = false;
     }
 

From 1b82571e4f5f71d9a989f1b076bbe444e22d78c0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 12 Mar 2026 08:29:28 +0200
Subject: [PATCH 3/6] llama : rever fgdn argument changes

---
 common/arg.cpp        | 15 ---------------
 common/common.cpp     |  1 -
 common/common.h       |  1 -
 include/llama.h       |  9 ---------
 src/llama-context.cpp | 16 +++-------------
 src/llama.cpp         | 12 ------------
 6 files changed, 3 insertions(+), 51 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 0f7caf1c8f4..41da8563d63 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1339,21 +1339,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                                    string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
                            }
                        }).set_env("LLAMA_ARG_FLASH_ATTN"));
-    add_opt(common_arg({ "-fgdn", "--fused-gdn" }, "[on|off|auto]",
-                       string_format("set fused Gated Delta Net kernel use ('on', 'off', or 'auto', default: '%s')",
-                                     llama_fused_gdn_type_name(params.fused_gdn_type)),
-                       [](common_params & params, const std::string & value) {
-                           if (is_truthy(value)) {
-                               params.fused_gdn_type = LLAMA_FUSED_GDN_TYPE_ENABLED;
-                           } else if (is_falsey(value)) {
-                               params.fused_gdn_type = LLAMA_FUSED_GDN_TYPE_DISABLED;
-                           } else if (is_autoy(value)) {
-                               params.fused_gdn_type = LLAMA_FUSED_GDN_TYPE_AUTO;
-                           } else {
-                               throw std::runtime_error(
-                                   string_format("error: unknown value for --fused-gdn: '%s'\n", value.c_str()));
-                           }
-                       }).set_env("LLAMA_ARG_FUSED_GDN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
         "prompt to start generation with; for system message, use -sys",
diff --git a/common/common.cpp b/common/common.cpp
index 2e7ae7b0f31..cc423d3439f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1369,7 +1369,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.pooling_type      = params.pooling_type;
     cparams.attention_type    = params.attention_type;
     cparams.flash_attn_type   = params.flash_attn_type;
-    cparams.fused_gdn_type    = params.fused_gdn_type;
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;
diff --git a/common/common.h b/common/common.h
index 0b30f0c8aab..c5645bba460 100644
--- a/common/common.h
+++ b/common/common.h
@@ -411,7 +411,6 @@ struct common_params {
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
     enum llama_flash_attn_type   flash_attn_type   = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
-    enum llama_fused_gdn_type    fused_gdn_type    = LLAMA_FUSED_GDN_TYPE_AUTO;  // whether to use fused Gated Delta Net kernel
 
     struct common_params_sampling    sampling;
     struct common_params_speculative speculative;
diff --git a/include/llama.h b/include/llama.h
index 05e5152d0db..c6e102abe51 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -190,14 +190,6 @@ extern "C" {
 
     LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
 
-    enum llama_fused_gdn_type {
-        LLAMA_FUSED_GDN_TYPE_AUTO     = -1,
-        LLAMA_FUSED_GDN_TYPE_DISABLED = 0,
-        LLAMA_FUSED_GDN_TYPE_ENABLED  = 1,
-    };
-
-    LLAMA_API const char * llama_fused_gdn_type_name(enum llama_fused_gdn_type fused_gdn_type);
-
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -346,7 +338,6 @@ extern "C" {
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type    attention_type;    // attention type to use for embeddings
         enum llama_flash_attn_type   flash_attn_type;   // when to enable Flash Attention
-        enum llama_fused_gdn_type    fused_gdn_type;    // when to enable fused Gated Delta Net kernel
 
         // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float    rope_freq_base;   // RoPE base frequency, 0 = from model
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4586bd07192..0be94939102 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -150,9 +150,9 @@ llama_context::llama_context(
     cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
     cparams.auto_fa    = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
 
-    cparams.fused_gdn_ar = params.fused_gdn_type != LLAMA_FUSED_GDN_TYPE_DISABLED;
-    cparams.fused_gdn_ch = params.fused_gdn_type != LLAMA_FUSED_GDN_TYPE_DISABLED;
-    cparams.auto_fgdn    = params.fused_gdn_type == LLAMA_FUSED_GDN_TYPE_AUTO;
+    cparams.fused_gdn_ar = true;
+    cparams.fused_gdn_ch = true;
+    cparams.auto_fgdn    = true;
 
     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -200,7 +200,6 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn   = %d\n",   __func__, cparams.causal_attn);
     LLAMA_LOG_INFO("%s: flash_attn    = %s\n",   __func__, llama_flash_attn_type_name(params.flash_attn_type));
-    LLAMA_LOG_INFO("%s: fused_gdn     = %s\n",   __func__, llama_fused_gdn_type_name(params.fused_gdn_type));
     LLAMA_LOG_INFO("%s: kv_unified    = %s\n",   __func__, cparams.kv_unified ? "true" : "false");
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
@@ -538,14 +537,6 @@ void llama_context::sched_reserve() {
             }
         }
 
-        // the fused kernel uses a transposed state layout, so both paths must agree
-        // to avoid a state layout mismatch when switching between AR and chunked
-        if (cparams.fused_gdn_ar != cparams.fused_gdn_ch) {
-            cparams.fused_gdn_ar = false;
-            cparams.fused_gdn_ch = false;
-            LLAMA_LOG_WARN("%s: fused Gated Delta Net AR/chunked support mismatch, disabling both\n", __func__);
-        }
-
         cparams.auto_fgdn = false;
     }
 
@@ -2878,7 +2869,6 @@ llama_context_params llama_context_default_params() {
         /*.pooling_type                =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.attention_type              =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
         /*.flash_attn_type             =*/ LLAMA_FLASH_ATTN_TYPE_AUTO,
-        /*.fused_gdn_type              =*/ LLAMA_FUSED_GDN_TYPE_AUTO,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,
         /*.yarn_ext_factor             =*/ -1.0f,
diff --git a/src/llama.cpp b/src/llama.cpp
index 4bc996bbee9..872e659edca 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -45,18 +45,6 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
     GGML_ABORT("fatal error");
 }
 
-const char * llama_fused_gdn_type_name(enum llama_fused_gdn_type fused_gdn_type) {
-    switch (fused_gdn_type) {
-        case LLAMA_FUSED_GDN_TYPE_AUTO:
-            return "auto";
-        case LLAMA_FUSED_GDN_TYPE_DISABLED:
-            return "disabled";
-        case LLAMA_FUSED_GDN_TYPE_ENABLED:
-            return "enabled";
-    }
-    GGML_ABORT("fatal error");
-}
-
 struct llama_device_memory_data {
     int64_t total;
     int64_t free;

From 7ea6ee4fa578c69df262bdc6ff5b9c1ded995f0f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 12 Mar 2026 08:30:40 +0200
Subject: [PATCH 4/6] graph : remove GDN state transposes

---
 src/models/delta-net-base.cpp | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp
index a62dbc15dd0..6bc989c9509 100644
--- a/src/models/delta-net-base.cpp
+++ b/src/models/delta-net-base.cpp
@@ -225,9 +225,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
     ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
     cb(kg_t, "key_gdiff_t", il);
 
-    ggml_tensor * s_t = ggml_transpose(ctx0, s);
-    s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
-    cb(s_t, "dnet_add_ch_state", il);
+    s = ggml_reshape_4d(ctx0, s, S_v, S_v, 1, H_v * n_seqs);
+    cb(s, "dnet_add_ch_state", il);
 
     // [CS, S_v, n_chunks, H_v * n_seqs]
     ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
@@ -240,7 +239,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
         ggml_tensor * ch_kg_t    = get_slice_2d(ctx0, kg_t,    chunk); // [ CS, S_k, 1, H_v * n_seqs]
 
         // [CS, S_v, 1, H_v * n_seqs]
-        ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
+        ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s);
         cb(v_t_p, "v_prime", il);
 
         // [CS, S_v, 1, H_v * n_seqs]
@@ -252,7 +251,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
         cb(v_attn, "v_attn", il);
 
         // [S_v, CS, 1, H_v * n_seqs]
-        ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
+        ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s, ch_q_g_exp);
         cb(attn_inter, "attn_inter", il);
 
         // [S_v, CS, 1, H_v * n_seqs]
@@ -268,13 +267,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
         // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
         ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);
 
-        s_t = ggml_mul(ctx0, s_t, ch_g_last_exp_t);
-        s_t = ggml_add(ctx0, s_t, kgv);
-        cb(s_t, "dnet_add_ch_state", il);
+        s = ggml_mul(ctx0, s, ch_g_last_exp_t);
+        s = ggml_add(ctx0, s, kgv);
+        cb(s, "dnet_add_ch_state", il);
     }
 
-    s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
-
     // truncate padded tokens
     ggml_tensor * o = ggml_view_4d(ctx0, v,
             S_v, n_tokens, H_v, n_seqs,
@@ -282,7 +279,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
             ggml_row_size(v->type, S_v * CS * n_chunks),
             ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
     o = ggml_permute  (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
-    s = ggml_transpose(ctx0, s_t);
+    s = ggml_reshape_4d(ctx0, s, S_v, S_v, H_v, n_seqs);
     cb(s, "output_state", il);
 
     return {o, s};
@@ -341,11 +338,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
     g = ggml_exp(ctx0, g);
     s = ggml_mul(ctx0, s, g);
 
-    ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
-
     // [1, S_v, H_v, n_seqs]
     ggml_tensor * sk;
-    sk = ggml_mul     (ctx0, s_t, k);
+    sk = ggml_mul     (ctx0, s, k);
     sk = ggml_sum_rows(ctx0, sk);
 
     // [S_v, 1, H_v, n_seqs]
@@ -362,15 +357,14 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
     k  = ggml_repeat(ctx0, k, s);
     kd = ggml_mul   (ctx0, k, d_t);
 
-    s_t = ggml_add(ctx0, s_t, kd);
+    s = ggml_add(ctx0, s, kd);
 
-    cb(s_t, "dnet_add_ar_state", il);
+    cb(s, "dnet_add_ar_state", il);
 
-    ggml_tensor * s_q = ggml_mul     (ctx0, s_t, q);
+    ggml_tensor * s_q = ggml_mul     (ctx0, s, q);
     ggml_tensor * o   = ggml_sum_rows(ctx0, s_q);
 
     o = ggml_permute  (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
-    s = ggml_transpose(ctx0, s_t);           // [S_v, S_v, H_v, n_seqs]
 
     return {o, s};
 }

From 80c9e50446b5e41684efcf2dce584a854586b741 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 12 Mar 2026 15:19:21 +0100
Subject: [PATCH 5/6] CUDA: GDN hide memory latency

---
 ggml/src/ggml-cuda/gated_delta_net.cu | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
index eb38588202a..bf161504235 100644
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
@@ -1,7 +1,8 @@
 #include "gated_delta_net.cuh"
 
 template <int S_v, bool KDA>
-__global__ void gated_delta_net_cuda(const float * q,
+__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
+gated_delta_net_cuda(const float * q,
                                      const float * k,
                                      const float * v,
                                      const float * g,
@@ -63,6 +64,16 @@ __global__ void gated_delta_net_cuda(const float * q,
 
         const float beta_val = *beta_t;
 
+        // Cache k and q in registers
+        float k_reg[rows_per_lane];
+        float q_reg[rows_per_lane];
+#pragma unroll
+        for (int r = 0; r < rows_per_lane; r++) {
+            const int i = r * warp_size + lane;
+            k_reg[r] = k_t[i];
+            q_reg[r] = q_t[i];
+        }
+
         if constexpr (!KDA) {
             const float g_val = expf(*g_t);
 
@@ -70,8 +81,7 @@ __global__ void gated_delta_net_cuda(const float * q,
             float kv_shard = 0.0f;
 #pragma unroll
             for (int r = 0; r < rows_per_lane; r++) {
-                const int i = r * warp_size + lane;
-                kv_shard += s_shard[r] * k_t[i];
+                kv_shard += s_shard[r] * k_reg[r];
             }
             float kv_col = warp_reduce_sum<warp_size>(kv_shard);
 
@@ -83,9 +93,8 @@ __global__ void gated_delta_net_cuda(const float * q,
             float attn_partial = 0.0f;
 #pragma unroll
             for (int r = 0; r < rows_per_lane; r++) {
-                const int i = r * warp_size + lane;
-                s_shard[r]  = g_val * s_shard[r] + k_t[i] * delta_col;
-                attn_partial += s_shard[r] * q_t[i];
+                s_shard[r]  = g_val * s_shard[r] + k_reg[r] * delta_col;
+                attn_partial += s_shard[r] * q_reg[r];
             }
 
             float attn_col = warp_reduce_sum<warp_size>(attn_partial);
@@ -99,7 +108,7 @@ __global__ void gated_delta_net_cuda(const float * q,
 #pragma unroll
             for (int r = 0; r < rows_per_lane; r++) {
                 const int i = r * warp_size + lane;
-                kv_shard += expf(g_t[i]) * s_shard[r] * k_t[i];
+                kv_shard += expf(g_t[i]) * s_shard[r] * k_reg[r];
             }
 
             float kv_col = warp_reduce_sum<warp_size>(kv_shard);
@@ -113,8 +122,8 @@ __global__ void gated_delta_net_cuda(const float * q,
 #pragma unroll
             for (int r = 0; r < rows_per_lane; r++) {
                 const int i = r * warp_size + lane;
-                s_shard[r]  = expf(g_t[i]) * s_shard[r] + k_t[i] * delta_col;
-                attn_partial += s_shard[r] * q_t[i];
+                s_shard[r]  = expf(g_t[i]) * s_shard[r] + k_reg[r] * delta_col;
+                attn_partial += s_shard[r] * q_reg[r];
             }
 
             float attn_col = warp_reduce_sum<warp_size>(attn_partial);

From 7897a63c94c351c78112c3ea2c7e02912e5d132c Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Fri, 13 Mar 2026 17:59:27 +0100
Subject: [PATCH 6/6] move curr_state to correct pos outside the loop

---
 ggml/src/ggml-cuda/gated_delta_net.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
index bf161504235..7f644ae0f46 100644
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
@@ -39,7 +39,7 @@ gated_delta_net_cuda(const float * q,
 
     const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
     state += state_offset;
-    curr_state += state_offset;
+    curr_state += state_offset + col * S_v;
     attn_data += (sequence * n_tokens * H + h_idx) * S_v;
 
     constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v;
@@ -47,10 +47,11 @@ gated_delta_net_cuda(const float * q,
     constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
     float         s_shard[rows_per_lane];
     // state is stored transposed: M[col][i] = S[i][col], row col is contiguous
+
 #pragma unroll
     for (int r = 0; r < rows_per_lane; r++) {
         const int i = r * warp_size + lane;
-        s_shard[r]  = curr_state[col * S_v + i];
+        s_shard[r]  = curr_state[i];
     }
 
     for (int t = 0; t < n_tokens; t++) {