Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -591,9 +591,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
}

// model is required (except for server)
// model is required (except for server, or when using --endpoint in CLI)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion && params.endpoint.empty()) {
throw std::invalid_argument("error: --model is required\n");
}

Expand Down Expand Up @@ -1380,6 +1380,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.show_timings = value;
}
).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
add_opt(common_arg(
{"--endpoint"}, "URL",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO this should be a bit more specific, something like --server-base

and in the future, when the server becomes a daemon (perhaps with a fixed port such as 8800), we can make --server-base the default mode and use that port

string_format("connect to a running llama-server at URL instead of loading a model locally (e.g. http://localhost:8080)"),
[](common_params & params, const std::string & value) {
params.endpoint = value;
}
).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_ENDPOINT"));
add_opt(common_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
Expand Down
4 changes: 4 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,10 @@ struct common_params {

bool single_turn = false; // single turn chat conversation

// remote server endpoint for CLI (e.g. "http://localhost:8080")
// when set, CLI connects to a running server instead of loading a model
std::string endpoint = ""; // NOLINT

ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

Expand Down
6 changes: 4 additions & 2 deletions tools/cli/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
set(TARGET llama-cli)
add_executable(${TARGET} cli.cpp)
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT})
add_executable(${TARGET} cli.cpp cli-backend.cpp)
target_link_libraries(${TARGET} PRIVATE cpp-httplib PUBLIC common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

include_directories(../server)
include_directories(../mtmd)
include_directories(../../vendor)

if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
Expand Down
Loading
Loading