Skip to content

Commit 477db4a

Browse files
authored
Merge pull request #90 from l3utterfly/master
merge from upstream
2 parents 21499b0 + 17a4258 commit 477db4a

File tree

212 files changed

+3342
-1144
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

212 files changed

+3342
-1144
lines changed

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ Maintainers reserve the right to decline review or close pull requests for any r
159159
160160
# Code maintenance
161161
162-
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for:
162+
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
163163
- Reviewing and merging related PRs
164164
- Fixing related bugs
165165
- Providing developer guidance/support

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
287287
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
288288
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
289289
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
290-
| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
290+
| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
291291
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
292292

293293
## Obtaining and quantizing models

common/arg.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2399,7 +2399,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
23992399
params.fit_params = false;
24002400
} else {
24012401
throw std::runtime_error(
2402-
string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
2402+
string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
24032403
}
24042404
}
24052405
).set_env("LLAMA_ARG_FIT"));

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -869,7 +869,7 @@ std::string common_detokenize(
869869
// Embedding utils
870870
//
871871

872-
// TODO: repace embd_norm with an enum
872+
// TODO: replace embd_norm with an enum
873873
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
874874

875875
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

common/console.cpp

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ namespace console {
8080
static termios initial_state;
8181
#endif
8282

83+
static completion_callback completion_cb = nullptr;
84+
8385
//
8486
// Init and cleanup
8587
//
@@ -493,7 +495,7 @@ namespace console {
493495
}
494496

495497
static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
496-
size_t & byte_pos) {
498+
size_t & byte_pos, int cursor_byte_pos = -1) {
497499
move_to_line_start(char_pos, byte_pos, widths);
498500
clear_current_line(widths);
499501

@@ -503,6 +505,7 @@ namespace console {
503505
char_pos = 0;
504506

505507
size_t idx = 0;
508+
int back_width = 0;
506509
while (idx < line.size()) {
507510
size_t advance = 0;
508511
char32_t cp = decode_utf8(line, idx, advance);
@@ -511,8 +514,15 @@ namespace console {
511514
if (real_width < 0) real_width = 0;
512515
widths.push_back(real_width);
513516
idx += advance;
514-
++char_pos;
515-
byte_pos = idx;
517+
if (cursor_byte_pos >= 0 && static_cast<size_t>(cursor_byte_pos) < idx) {
518+
back_width += real_width;
519+
} else {
520+
++char_pos;
521+
byte_pos = idx;
522+
}
523+
}
524+
if (cursor_byte_pos >= 0) {
525+
move_cursor(-back_width);
516526
}
517527
}
518528

@@ -784,6 +794,20 @@ namespace console {
784794
break;
785795
}
786796

797+
if (completion_cb && input_char == '\t') {
798+
auto candidates = completion_cb(line, byte_pos);
799+
800+
if (!candidates.empty()) {
801+
if (candidates.size() > 1 || candidates[0].first != line) {
802+
// TODO?: Display all candidates
803+
set_line_contents(candidates[0].first, line, widths, char_pos, byte_pos, candidates[0].second);
804+
} else {
805+
// TODO: Move cursor to new byte_pos
806+
}
807+
continue;
808+
}
809+
}
810+
787811
if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
788812
end_of_stream = true;
789813
break;
@@ -1062,6 +1086,10 @@ namespace console {
10621086
return readline_advanced(line, multiline_input);
10631087
}
10641088

1089+
void set_completion_callback(completion_callback cb) {
1090+
completion_cb = cb;
1091+
}
1092+
10651093
namespace spinner {
10661094
static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
10671095
static std::condition_variable cv_stop;

common/console.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44

55
#include "common.h"
66

7+
#include <functional>
78
#include <string>
9+
#include <vector>
810

911
enum display_type {
1012
DISPLAY_TYPE_RESET = 0,
@@ -21,6 +23,9 @@ namespace console {
2123
void set_display(display_type display);
2224
bool readline(std::string & line, bool multiline_input);
2325

26+
using completion_callback = std::function<std::vector<std::pair<std::string, size_t>>(std::string_view, size_t)>;
27+
void set_completion_callback(completion_callback cb);
28+
2429
namespace spinner {
2530
void start();
2631
void stop();

common/debug.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml
1818
// prints tensors that are processed in the computation graph
1919
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
2020
// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
21-
// The template parameter determins whether an error should be thrown whenever a NaN is encountered
21+
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
2222
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
2323
// The callback data will be passed as the third parameter (user_data)
2424
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);

common/jinja/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ The llama.cpp Jinja engine introduces `jinja::string` (see `jinja/string.h`), wh
6363
- **One-to-many** (e.g., split): result is marked `is_input` **only if ALL** input parts are marked `is_input`
6464
- **Many-to-one** (e.g., join): same as one-to-many
6565

66-
For string concatenation, string parts will be appended to the new string as-is, while perserving the `is_input` flag.
66+
For string concatenation, string parts will be appended to the new string as-is, while preserving the `is_input` flag.
6767

6868
**Enabling Input Marking:**
6969

convert_hf_to_gguf.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4031,7 +4031,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
40314031
# split Conv3D into Conv2Ds
40324032
c1, c2, kt, kh, kw = data_torch.shape
40334033
del c1, c2, kh, kw # unused
4034-
assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
4034+
assert kt == 2, "Current implementation only support temporal_patch_size of 2"
40354035
yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...])
40364036
yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
40374037
else:
@@ -4842,12 +4842,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
48424842
yield from super().modify_tensors(data_torch, name, bid)
48434843

48444844

4845-
@ModelBase.register("Qwen3_5ForConditionalGeneration")
4845+
@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
48464846
class Qwen3_5TextModel(_LinearAttentionVReorderBase):
48474847
model_arch = gguf.MODEL_ARCH.QWEN35
48484848

48494849

4850-
@ModelBase.register("Qwen3_5MoeForConditionalGeneration")
4850+
@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
48514851
class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
48524852
model_arch = gguf.MODEL_ARCH.QWEN35MOE
48534853

@@ -5404,7 +5404,7 @@ def set_gguf_parameters(self):
54045404
# Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
54055405
linear_attn_config = self.hparams["linear_attn_config"]
54065406
# n_head == 0 for KDA layers, n_head > 0 for MLA layers
5407-
# full_attention_layers list will be used to distingush layer type
5407+
# full_attention_layers list will be used to distinguish layer type
54085408
_num_kv_heads = list()
54095409
_full_attn_layers = linear_attn_config["full_attn_layers"]
54105410
for il in range(self.hparams["num_hidden_layers"]):
@@ -6505,7 +6505,7 @@ def set_gguf_parameters(self):
65056505
super().set_gguf_parameters()
65066506
hparams = self.hparams
65076507
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
6508-
# default values below are taken from HF tranformers code
6508+
# default values below are taken from HF transformers code
65096509
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
65106510
self.gguf_writer.add_vision_use_gelu(True)
65116511
# calculate proj_scale_factor (used by tinygemma3 test model)
@@ -7097,7 +7097,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
70977097

70987098
if bid == 0 and "time_mix_a" in new_name:
70997099
# dummy v0/v1/v2 on first layer
7100-
# easist way to make llama happy
7100+
# easiest way to make llama happy
71017101
yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)
71027102

71037103
yield (new_name, data_torch)
@@ -9596,7 +9596,7 @@ def __init__(self, *args, **kwargs):
95969596
# NOTE: Explicitly include hparam prefix prefix for d_model to
95979597
# disambiguate with top-level head_dim
95989598
# NOTE 2: If needed for future models, this can be isolated in a method
9599-
# to separate the prefix setting and teh keys used
9599+
# to separate the prefix setting and the keys used
96009600
self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"])
96019601
self.n_group = self.find_hparam(["n_groups", "num_groups"])
96029602
self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model
@@ -9743,7 +9743,7 @@ def set_gguf_parameters(self):
97439743
self.gguf_writer.add_value_length(self.head_dim)
97449744

97459745
# Set feed_forward_length
9746-
# NOTE: This will trigger an override warning. This is preferrable to
9746+
# NOTE: This will trigger an override warning. This is preferable to
97479747
# duplicating all the parent logic
97489748
if not self.is_moe:
97499749
n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])

docs/backend/CANN.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
**Llama.cpp + CANN**
2222

23-
The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
23+
The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are integrated to CANN Toolkit and kernels to using Ascend NPU directly.
2424

2525
## News
2626

@@ -210,7 +210,7 @@ docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager
210210
# and install driver.
211211
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
212212
```
213-
If the following messaage appers, firmware is installed successfully.
213+
If the following message appears, firmware is installed successfully.
214214
```sh
215215
Firmware package installed successfully!
216216
```

0 commit comments

Comments (0)