From 9a18aa8ac8626b50ea7ccef4d8ed8804825296f7 Mon Sep 17 00:00:00 2001
From: Roj234 <82699138+roj234@users.noreply.github.com>
Date: Sun, 15 Feb 2026 19:27:52 +0800
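Subject: [PATCH 1/2] Fix unused allocation

Previously self.scores was always allocated, with n_batch rows when
logits_all was false. Allocate it (with n_ctx rows) only when
self._logits_all is set, and leave it as None otherwise.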

---
 llama_cpp/llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index df2ad74dc..98481aa88 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -526,8 +526,8 @@ def free_lora_adapter():
         self.n_tokens = 0
         self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc)
         self.scores: npt.NDArray[np.single] = np.ndarray(
-            (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single
-        )
+            (n_ctx, self._n_vocab), dtype=np.single
+        ) if self._logits_all else None
 
         self._mirostat_mu = ctypes.c_float(
             2.0 * 5.0

From 5340e707a4bec0185258a960cb4b6bb72b6c1ce5 Mon Sep 17 00:00:00 2001
From: Roj234 <82699138+roj234@users.noreply.github.com>
Date: Sun, 15 Feb 2026 19:32:33 +0800
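Subject: [PATCH 2/2] Add runtime checks for logits_all

Raise RuntimeError from the _scores and eval_logits properties when
self._logits_all is false, and make load_state restore scores only
when self._logits_all is set.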

---
 llama_cpp/llama.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 98481aa88..147c4ef5b 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -638,6 +638,10 @@ def _input_ids(self) -> npt.NDArray[np.intc]:
 
     @property
     def _scores(self) -> npt.NDArray[np.single]:
+        if not self._logits_all:
+            raise RuntimeError(
+                "Llama model must be created with logits_all=True to call this method"
+            )
         return self.scores[: self.n_tokens, :]
 
     @property
@@ -646,6 +650,10 @@ def eval_tokens(self) -> Deque[int]:
 
     @property
     def eval_logits(self) -> Deque[List[float]]:
+        if not self._logits_all:
+            raise RuntimeError(
+                "Llama model must be created with logits_all=True to call this method"
+            )
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
             maxlen=self._n_ctx if self._logits_all else 1,
@@ -2434,10 +2442,11 @@ def save_state(self) -> LlamaState:
         )
 
     def load_state(self, state: LlamaState) -> None:
-        # Only filling in up to `n_tokens` and then zero-ing out the rest
-        self.scores[: state.n_tokens, :] = state.scores.copy()
-        rest = self.scores[state.n_tokens :, :]
-        rest[rest > 0] = 0.0
+        if self._logits_all:
+            # Only filling in up to `n_tokens` and then zero-ing out the rest
+            self.scores[: state.n_tokens, :] = state.scores.copy()
+            rest = self.scores[state.n_tokens :, :]
+            rest[rest > 0] = 0.0
         self.input_ids = state.input_ids.copy()
         self.n_tokens = state.n_tokens
         self._seed = state.seed