trymirai
diff --git a/‎Cargo.toml‎
Lines changed: 0 additions & 1 deletion b/‎Cargo.toml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎crates/cli/src/handlers/run.rs‎
Lines changed: 3 additions & 3 deletions b/‎crates/cli/src/handlers/run.rs‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎crates/cli/src/handlers/serve.rs‎
Lines changed: 3 additions & 2 deletions b/‎crates/cli/src/handlers/serve.rs‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎crates/cli/src/lib.rs‎
Lines changed: 1 addition & 0 deletions b/‎crates/cli/src/lib.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/cli/src/main.rs‎
Lines changed: 22 additions & 8 deletions b/‎crates/cli/src/main.rs‎
Lines changed: 22 additions & 8 deletions
diff --git a/‎crates/cli/src/server/main.rs‎
Lines changed: 6 additions & 2 deletions b/‎crates/cli/src/server/main.rs‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎crates/cli/src/server/state.rs‎
Lines changed: 9 additions & 24 deletions b/‎crates/cli/src/server/state.rs‎
Lines changed: 9 additions & 24 deletions
diff --git a/‎crates/cli/src/speculator_args.rs‎
Lines changed: 99 additions & 0 deletions b/‎crates/cli/src/speculator_args.rs‎
Lines changed: 99 additions & 0 deletions
diff --git a/‎crates/uzu/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎crates/uzu/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/uzu/src/config/language_model.rs‎
Lines changed: 2 additions & 0 deletions b/‎crates/uzu/src/config/language_model.rs‎
Lines changed: 2 additions & 0 deletions
@@ -70,7 +70,6 @@ autocxx = "0.30"
 cxx = "1.0"
 cmake = "0.1"
 autocxx-build = "0.30"
-
 # optimize the build script in debug builds
 [profile.dev.build-override]
 opt-level = 3
 
@@ -12,7 +12,7 @@ use uzu::session::{
     types::{Input, Output},
 };
 
-use crate::server::load_session;
+use crate::{server::load_session, speculator_args::SpeculatorArgs};
 
 fn format_output(output: Output) -> String {
     let stats = &output.stats;
@@ -53,11 +53,11 @@ pub fn handle_run(
     tokens_limit: usize,
     prefill_step_size: Option<usize>,
     seed: Option<u64>,
-    speculator: Option<String>,
     mut message: Option<String>,
     no_thinking: bool,
+    speculator_args: SpeculatorArgs,
 ) {
-    let mut session = load_session(model_path, prefill_step_size, seed, speculator);
+    let mut session = load_session(model_path, prefill_step_size, seed, speculator_args);
 
     let is_model_running = Arc::new(AtomicBool::new(false));
     let is_model_running_for_ctrlc = is_model_running.clone();
 
@@ -1,11 +1,12 @@
 use tokio::runtime::Runtime;
 
-use crate::server::main::run_server;
+use crate::{server::main::run_server, speculator_args::SpeculatorArgs};
 
 pub fn handle_serve(
     model_path: String,
     prefill_step_size: Option<usize>,
+    speculator_args: SpeculatorArgs,
 ) {
     let runtime = Runtime::new().unwrap();
-    runtime.block_on(run_server(model_path, prefill_step_size));
+    runtime.block_on(run_server(model_path, prefill_step_size, speculator_args));
 }
@@ -1,2 +1,3 @@
 pub mod handlers;
 pub mod server;
+pub mod speculator_args;
@@ -1,5 +1,8 @@
 use clap::{CommandFactory, Parser, Subcommand};
-use cli::handlers::{handle_bench, handle_run, handle_serve};
+use cli::{
+    handlers::{handle_bench, handle_run, handle_serve},
+    speculator_args::SpeculatorArgs,
+};
 
 #[derive(Parser)]
 struct Cli {
@@ -15,24 +18,26 @@ enum Commands {
         model_path: String,
         /// Prefill step size
         prefill_step_size: Option<usize>,
-        // Seed
+        /// Seed
         #[arg(long)]
         seed: Option<u64>,
-        // Speculator
-        #[arg(long)]
-        speculator: Option<String>,
         /// Non-interactive mode: run a single message and exit
         #[arg(long, short)]
         message: Option<String>,
         #[arg(long, short)]
+        /// Disable thinking mode
         no_thinking: bool,
+        #[command(flatten)]
+        speculator_args: SpeculatorArgs,
     },
     /// Start a server with the specified model path
     Serve {
         /// Folder with model's files
         model_path: String,
         /// Prefill step size
         prefill_step_size: Option<usize>,
+        #[command(flatten)]
+        speculator_args: SpeculatorArgs,
     },
     /// Run benchmarks for the specified model
     Bench {
@@ -53,17 +58,26 @@ fn main() {
             model_path,
             prefill_step_size,
             seed,
-            speculator,
             message,
             no_thinking,
+            speculator_args,
         }) => {
-            handle_run(model_path, 2048, prefill_step_size, seed, speculator, message, no_thinking);
+            handle_run(
+                model_path,
+                2048,
+                prefill_step_size,
+                seed,
+                message,
+                no_thinking,
+                speculator_args,
+            );
         },
         Some(Commands::Serve {
             model_path,
             prefill_step_size,
+            speculator_args,
         }) => {
-            handle_serve(model_path, prefill_step_size);
+            handle_serve(model_path, prefill_step_size, speculator_args);
         },
         Some(Commands::Bench {
             model_path,
 
@@ -3,7 +3,10 @@ use std::path::PathBuf;
 use log::LevelFilter;
 use rocket::{Config, config::LogLevel, log::private as log, routes};
 
-use crate::server::{SessionState, SessionWrapper, handle_chat_completions, handle_models, load_session};
+use crate::{
+    server::{SessionState, SessionWrapper, handle_chat_completions, handle_models, load_session},
+    speculator_args::SpeculatorArgs,
+};
 
 struct SilentLogger;
 static SILENT_LOGGER: SilentLogger = SilentLogger;
@@ -26,6 +29,7 @@ impl log::Log for SilentLogger {
 pub async fn run_server(
     model_path: String,
     prefill_step_size: Option<usize>,
+    speculator_args: SpeculatorArgs,
 ) {
     // Install the silent logger **before** Rocket initializes its own logger.
     let _ = log::set_logger(&SILENT_LOGGER).map(|_| log::set_max_level(LevelFilter::Off));
@@ -43,7 +47,7 @@ pub async fn run_server(
     println!("🌐 Server will be available at: http://localhost:{}", config.port);
     println!("📝 Endpoints:\n   POST /chat/completions - Chat completions API\n");
 
-    let session = load_session(model_path, prefill_step_size, None, None);
+    let session = load_session(model_path, prefill_step_size, None, speculator_args);
     let state = SessionState {
         model_name,
         session_wrapper: std::sync::Arc::new(SessionWrapper::new(session)),
 
@@ -6,7 +6,7 @@ use std::{
 use console::Style;
 use indicatif::{ProgressBar, ProgressStyle};
 use uzu::{
-    prelude::{SamplingSeed, SpeculatorConfig},
+    prelude::SamplingSeed,
     session::{
         Session,
         config::{DecodingConfig, RunConfig},
@@ -15,6 +15,8 @@ use uzu::{
     },
 };
 
+use crate::speculator_args::SpeculatorArgs;
+
 pub trait RunSession {
     fn run(
         &mut self,
@@ -59,7 +61,7 @@ pub fn load_session(
     model_path: String,
     prefill_step_size: Option<usize>,
     seed: Option<u64>,
-    speculator: Option<String>,
+    speculator_args: SpeculatorArgs,
 ) -> Session {
     let style_bold = Style::new().bold();
 
@@ -71,35 +73,18 @@ pub fn load_session(
     progress_bar.set_style(ProgressStyle::default_spinner().template("{spinner:.green} Loading: {msg}").unwrap());
     progress_bar.set_message(model_name.clone());
 
-    let prefill_step_size_config: PrefillStepSize;
-    if let Some(value) = prefill_step_size {
-        prefill_step_size_config = PrefillStepSize::Custom(value);
-    } else {
-        prefill_step_size_config = PrefillStepSize::Default;
-    }
+    let prefill_step_size_config = match prefill_step_size {
+        Some(value) => PrefillStepSize::Custom(value),
+        None => PrefillStepSize::Default,
+    };
 
     let decoding_config = DecodingConfig::default()
         .with_prefill_step_size(prefill_step_size_config)
         .with_sampling_seed(match seed {
             Some(seed) => SamplingSeed::Custom(seed),
             None => SamplingSeed::Default,
         })
-        .with_speculator_config(match speculator {
-            Some(speculator) => {
-                let (speculator, number_of_speculated_tokens) =
-                    speculator.split_once(':').unwrap_or((&speculator, "1"));
-
-                let number_of_speculated_tokens = number_of_speculated_tokens.parse().unwrap();
-
-                let speculator = Arc::new(uzu::speculators::ngram_speculator::NGramSpeculator::load(speculator));
-
-                SpeculatorConfig {
-                    number_of_speculated_tokens,
-                    speculator,
-                }
-            },
-            None => SpeculatorConfig::default(),
-        });
+        .with_speculator_config(speculator_args.build_speculator_config(&model_path_buf));
     let session = Session::new(model_path_buf, decoding_config).expect("Failed to create session");
 
     progress_bar.set_style(ProgressStyle::default_spinner().template("Loaded: {msg}").unwrap());
 
@@ -0,0 +1,99 @@
+
+use std::{path::Path, sync::Arc};
+use clap::{Args, ValueEnum};
+use uzu::{
+    backends::metal::Metal,
+    prelude::{NGramSpeculator, NeuralSpeculator, SpeculatorConfig},
+};
+
+/// Speculator implementations that can be selected with `--speculator-type`.
+///
+/// The per-variant doc comments double as the descriptions clap shows for each
+/// possible value in `--help`.
+#[derive(ValueEnum, Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SpeculatorType {
+    /// Neural PARD draft model loaded from --speculator-path
+    Pard,
+    /// N-gram speculator loaded from a model.bin file
+    Ngram,
+}
+
+// Speculation-related command-line options, meant to be embedded into a
+// subcommand with `#[command(flatten)]`.
+// NOTE(review): kept as a plain `//` comment on purpose — a `///` doc comment on a
+// flattened `Args` struct may leak into the parent command's help text; confirm
+// against the clap version in use before promoting it to a doc comment.
+#[derive(Args, Debug, Clone)]
+pub struct SpeculatorArgs {
+    /// Type of the speculator to use. If not specified, speculation is disabled.
+    #[arg(long, value_enum)]
+    pub speculator_type: Option<SpeculatorType>,
+    /// Path to the speculator model. Required for pard; for ngram it defaults to a
+    /// model.bin discovered inside the model folder's speculators directory.
+    #[arg(long)]
+    pub speculator_path: Option<String>,
+    /// Number of tokens to speculate.
+    #[arg(long = "speculator-tokens", default_value_t = 1)]
+    pub speculated_tokens: usize,
+}
+
+impl SpeculatorArgs {
+    /// Builds the [`SpeculatorConfig`] described by these command-line options.
+    ///
+    /// * No `--speculator-type`: returns `SpeculatorConfig::default()`.
+    /// * `pard`: loads a [`NeuralSpeculator`] from `--speculator-path`, which is mandatory.
+    /// * `ngram`: loads an [`NGramSpeculator`] from `--speculator-path`, or from a `model.bin`
+    ///   discovered under `model_path/speculators/` when no path was given.
+    ///
+    /// Invalid command-line input (missing PARD path, no discoverable n-gram model) prints an
+    /// error to stderr and terminates the process with exit status 1.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the selected speculator fails to load.
+    pub fn build_speculator_config(
+        &self,
+        model_path: &Path,
+    ) -> SpeculatorConfig {
+        let Some(spec_type) = &self.speculator_type else {
+            return SpeculatorConfig::default();
+        };
+
+        let n = self.speculated_tokens;
+
+        match spec_type {
+            SpeculatorType::Pard => {
+                // A missing path is a usage error, so report it the same way
+                // `resolve_ngram_path` does instead of panicking.
+                let Some(path_str) = self.speculator_path.as_deref() else {
+                    eprintln!("error: --speculator-path is required when --speculator-type is pard");
+                    std::process::exit(1);
+                };
+                // NOTE(review): the literal `8` and the `n + 1` below are carried over
+                // unchanged; their meaning is defined by the uzu API — confirm there.
+                let speculator = NeuralSpeculator::<Metal>::new(Path::new(path_str), n, 8)
+                    .expect("Failed to load PARD draft model");
+                SpeculatorConfig::new(n + 1, Arc::new(speculator))
+            },
+            SpeculatorType::Ngram => {
+                // `resolve_ngram_path` already honours an explicit `--speculator-path`,
+                // so no separate branch is needed here.
+                let ngram_path = self.resolve_ngram_path(model_path).to_string_lossy().into_owned();
+                let speculator =
+                    NGramSpeculator::load(ngram_path.as_str()).expect("Failed to load NGram speculator");
+                SpeculatorConfig::new(n, Arc::new(speculator))
+            },
+        }
+    }
+
+    /// Picks the n-gram model file to load.
+    ///
+    /// An explicit `--speculator-path` always wins. Otherwise every
+    /// `speculators/<name>/model.bin` under `model_path` is a candidate: the one whose
+    /// directory is named `chat` is preferred, and failing that the lexicographically first
+    /// candidate is used, so the choice does not depend on directory iteration order.
+    ///
+    /// Prints an error to stderr and exits with status 1 when no candidate exists.
+    fn resolve_ngram_path(
+        &self,
+        model_path: &Path,
+    ) -> std::path::PathBuf {
+        if let Some(explicit) = self.speculator_path.as_deref() {
+            return std::path::PathBuf::from(explicit);
+        }
+
+        let speculators_dir = model_path.join("speculators");
+
+        // An unreadable directory or entry is skipped here; if that leaves no
+        // candidates, the empty-result error below reports it.
+        let mut found: Vec<std::path::PathBuf> = std::fs::read_dir(&speculators_dir)
+            .into_iter()
+            .flatten()
+            .flatten()
+            .map(|entry| entry.path().join("model.bin"))
+            .filter(|candidate| candidate.exists())
+            .collect();
+        // `read_dir` yields entries in a platform- and filesystem-dependent order;
+        // sorting makes the fallback pick reproducible.
+        found.sort();
+
+        if found.is_empty() {
+            eprintln!(
+                "error: no ngram speculator found in {}\n\
+                 Looked for: {}/*/model.bin\n\
+                 Specify a path explicitly with --speculator-path <path>",
+                speculators_dir.display(),
+                speculators_dir.display(),
+            );
+            std::process::exit(1);
+        }
+
+        // Prefer the candidate living in a directory called `chat`; index 0 is valid
+        // because `found` is known to be non-empty at this point.
+        let chosen = found
+            .iter()
+            .position(|p| p.parent().and_then(Path::file_name).map_or(false, |name| name == "chat"))
+            .unwrap_or(0);
+        found.swap_remove(chosen)
+    }
+}
@@ -46,6 +46,7 @@ is_close.workspace = true
 serde_json5.workspace = true
 schemars.workspace = true
 mpsgraph.workspace = true
+tempfile.workspace = true
 
 [build-dependencies]
 anyhow.workspace = true
 
@@ -19,6 +19,8 @@ pub struct InnerModelConfig {
     pub embedding_config: EmbeddingConfig,
     pub transformer_config: TransformerConfig,
     pub vocab_size: usize,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub pard_token: Option<u64>,
 }
 
 impl InnerModelConfig {
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`pub mod handlers;`
`2`	`2`	`pub mod server;`
	`3`	`+pub mod speculator_args;`
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,8 @@ pub struct InnerModelConfig {`
`19`	`19`	`pub embedding_config: EmbeddingConfig,`
`20`	`20`	`pub transformer_config: TransformerConfig,`
`21`	`21`	`pub vocab_size: usize,`
	`22`	`+ #[serde(default, skip_serializing_if = "Option::is_none")]`
	`23`	`+ pub pard_token: Option<u64>,`
`22`	`24`	`}`
`23`	`25`
`24`	`26`	`impl InnerModelConfig {`