Merged

1231 commits
99c53d6
webui: Add a "Continue" Action for Assistant Message (#16971)
allozaur Nov 19, 2025
2eba631
vulkan: Add copy_transpose shader (#17371)
jeffbolznv Nov 19, 2025
1fa4551
vulkan: support larger argsort (#17313)
jeffbolznv Nov 19, 2025
7d77f07
vulkan: implement ADD1, ARANGE, FILL, SOFTPLUS, STEP, ROUND, CEIL, FL…
giuseppe Nov 19, 2025
5be353e
ggml-cpu:add RISC-V RVV (Zvfh) optimization for FP16 vector scaling (…
ixgbe Nov 20, 2025
3ae282a
kleidiai: fix zero-size array declaration (#17240)
sudhiarm Nov 20, 2025
79bb743
ggml : remove useless and error-prone variadic macros (#17399)
angt Nov 20, 2025
a7784a8
DGX Spark: UMA support (#17368)
sfudally-nvidia Nov 20, 2025
845f200
ggml : Fix transposed SOLVE_TRI result (#17323)
pwilkin Nov 20, 2025
5088b43
convert : fix TypeError when loading base model remotely in convert_l…
o7si Nov 20, 2025
196f508
common : more accurate sampling timing (#17382)
ggerganov Nov 20, 2025
1d321e5
metal : fix compile on macos 11 (whisper/3533)
smilingpoplar Nov 20, 2025
2286a36
sync : ggml
ggerganov Nov 20, 2025
92c0b38
grammar : fix integer overflow (#17381)
pwilkin Nov 20, 2025
4c91f26
Improved file naming & structure for UI components (#17405)
allozaur Nov 20, 2025
054a45c
grammar: fix regression caused by #17381 (#17412)
ngxson Nov 20, 2025
dd0f321
readme : add Unsloth exporting to GGUF in tools (#17411)
danielhanchen Nov 20, 2025
21d31e0
ggml-hexagon: fix swiglu failure at `test-backend-ops` (#17344)
chraac Nov 20, 2025
2370665
CANN: Refactor `evaluate_and_capture_cann_graph` (#17333)
rauletorresc Nov 21, 2025
f1ffbba
vulkan: disable async for older Intel devices (#17369)
jeffbolznv Nov 21, 2025
9cc4080
ci : start using OpenSSL (#17235)
angt Nov 21, 2025
28175f8
cmake : add option to build and link BoringSSL (#17205)
angt Nov 21, 2025
23bc779
model : detect GigaChat3-10-A1.8B as deepseek lite (#17420)
ubergarm Nov 21, 2025
8e9ddba
opencl: refine condition for kqv mm (#17392)
lhez Nov 21, 2025
028f93e
HIP: RDNA4 tensor core support for MMF (#17077)
zhang-hui-yulo Nov 21, 2025
3f3a4fb
Revive MUL_MAT_ID to perf testing (#17397)
rillomas Nov 22, 2025
4949ac0
ci : switch to BoringSSL on Server workflow (#17441)
angt Nov 22, 2025
54d83bb
vulkan: remove a couple unnecessary switches (#17419)
jeffbolznv Nov 23, 2025
bc809e9
vulkan: Update docker image to Ubuntu 26.04 to enable glslc features …
ericcurtin Nov 23, 2025
96ac5a2
cuda : support non-contiguous i32 to i32 copy (#17326)
CISC Nov 23, 2025
0c7220d
webui: minor settings reorganization and add disable autoscroll optio…
ServeurpersoCom Nov 23, 2025
d5bc1ad
ggml-hexagon: add `hex_supported_buffer` for better buffer supported …
chraac Nov 23, 2025
fcb0138
ggml-hexagon: Initial Hexagon v68/v69 support (#17394)
mediouni-m Nov 24, 2025
01ad35e
CANN: Define `cann_graph_update_required` before macro (#17434)
rauletorresc Nov 24, 2025
923ae3c
hexagon: add support for ROPE_NEOX (#17458)
max-krasnyansky Nov 24, 2025
4902eeb
models : Added support for RND1 Diffusion Language Model (#17433)
wp4032 Nov 24, 2025
5f55c38
ggml: add RISC-V cpu-feats (#17461)
ixgbe Nov 24, 2025
dbb852b
ggml-cpu: arm64: q4_K repack gemm and gemv implementations (i8mm) (#1…
Alcpz Nov 24, 2025
697edfe
ggml : remove dirty flag from version string (ggml/1391)
danbev Nov 24, 2025
2d50b9d
sync : ggml
ggerganov Nov 24, 2025
6ab8eac
examples : add -kvu to batched usage example [no ci] (#17469)
danbev Nov 24, 2025
b8372ee
server: split server.cpp code into server/common/task/queue (#17362)
ngxson Nov 24, 2025
b61de2b
convert : allow quantizing lora again (#17453)
CISC Nov 24, 2025
0543f92
HIP: WMMA-MMQ kernels for RDNA 4 (#17156)
jiachengjason Nov 24, 2025
134e694
llama : skip output reordering for single token batches (#17466)
danbev Nov 24, 2025
3d07caa
vulkan: more FA details in vk_perf_logger (#17443)
jeffbolznv Nov 24, 2025
877566d
llama: introduce support for model-embedded sampling parameters (#17120)
taronaeo Nov 25, 2025
d414db0
vulkan: Use fewer rows for scalar FA when HS is not a multiple of 16 …
jeffbolznv Nov 25, 2025
b1846f1
webui: add rehype plugin to restore HTML in Markdown table cells (#17…
ServeurpersoCom Nov 25, 2025
064c90d
CANN: supports out_prod operator for F32 and F16 (#17406)
TianHao324 Nov 25, 2025
55ab25c
codeowners : remove slaren (#17492)
slaren Nov 25, 2025
05872ac
convert : fix big-endian conversion (#17431)
AlekseiNikiforovIBM Nov 25, 2025
583cb83
ggml : add ggml_top_k (#17365)
ggerganov Nov 25, 2025
b3b03a7
vulkan: Implement GGML_OP_CUMSUM (#17479)
jeffbolznv Nov 26, 2025
f3a848a
chore: upgrade cpp-httplib from v0.27.0 to v0.28.0 (#17513)
o7si Nov 26, 2025
eeb5605
CANN: Add MROPE and IMROPE support (#17401)
hipudding Nov 26, 2025
3e18dba
HIP: Patch failed testcase in WMMA-MMQ kernels for RDNA 4 (#17502)
jiachengjason Nov 26, 2025
e6923ca
ggml : fix ARM feature verification (#17519)
angt Nov 26, 2025
2336cc4
cmake : use EXCLUDE_FROM_ALL to avoid patch-boringssl.cmake (#17520)
angt Nov 26, 2025
6ab4e50
ggml-cpu : add RISC-V Zvfh impl for ggml_vec_mad_f16 (#17448)
xctan Nov 26, 2025
879d673
vulkan: Implement top-k (#17418)
jeffbolznv Nov 26, 2025
eec1e33
vulkan: allow graph_optimize for prompt processing workloads (#17475)
jeffbolznv Nov 26, 2025
1d594c2
clip: (minicpmv) fix resampler kq_scale (#17516)
hNSBQZ Nov 26, 2025
5449367
Fix chunks being too small with small matrix sizes (#17526)
Alcpz Nov 26, 2025
7cba58b
opencl: add sqr, sqrt, mean and ssm_conv (#17476)
lhez Nov 26, 2025
e509411
server: enable jinja by default, update docs (#17524)
ngxson Nov 27, 2025
142df17
vulkan: use a fixed 1KB buffer for the add_rms_fusion opt (#17514)
jeffbolznv Nov 27, 2025
b78db3b
vulkan : move contiguous checks to device_supports_op (#17490)
Acly Nov 27, 2025
4fcd87c
gguf-py : skip endian-conversion of MXFP4 data (#17523)
AlekseiNikiforovIBM Nov 27, 2025
d21a76a
devops: Add build-essential to Ubuntu 26.04 image (#17531)
ericcurtin Nov 27, 2025
cd8370b
ggml-cpu: aarm64: q4_K repack gemm and gemv implementations (dotprod …
Alcpz Nov 27, 2025
909072a
cuda : fix UMA detection on discrete GPUs. (#17537)
matt23654 Nov 27, 2025
6783b11
models : fix LFM2 tensors (#17548)
ggerganov Nov 27, 2025
c386114
arch : add description about LLM_TENSOR_INFOS (#17550)
ggerganov Nov 27, 2025
4abef75
vulkan: Implement SOLVE_TRI (#17486)
jeffbolznv Nov 27, 2025
efaaccd
refactor pad_reflect_1d to make the UT case pass (#17204)
NeoZhangJianyu Nov 28, 2025
cd0e3a7
SOLVE_TRI CUDA kernel for small matrices (#17457)
pwilkin Nov 28, 2025
6bca76f
HIP: enable mul_mat_f for RDNA4 (#17437)
zhang-hui-yulo Nov 28, 2025
15d2b46
rpc : cache and reuse compute graphs (#15405)
rgerganov Nov 28, 2025
35cf888
vulkan: Implement GGML_OP_TRI (#17503)
jeffbolznv Nov 28, 2025
73955f7
CUDA: no FP16 arithmetic for vector FA kernel (#17558)
JohannesGaessler Nov 28, 2025
ff55414
model : Qwen3 Next (#16095)
pwilkin Nov 28, 2025
ddf9f94
server : add Anthropic Messages API support (#17570)
noname22 Nov 28, 2025
2e7ef98
ggml-cuda: add stricter checking for fusion (#17568)
am17an Nov 28, 2025
c6f7a42
[MUSA] enable fp16/fast_fp16/bf16_mma on PH1 (#17551)
yeahdongcn Nov 28, 2025
e072b20
ggml : add GGML_SCHED_NO_REALLOC option to disable reallocations in g…
slaren Nov 28, 2025
3ce7a65
server: fix: /metrics endpoint returning JSON-escaped Prometheus form…
o7si Nov 28, 2025
03914c7
common : move all common_chat_parse_* to chat-parser.cpp. (#17481)
dranger003 Nov 28, 2025
d82b7a7
gguf-py : fix passing non-native endian tensors (editor-gui and new-m…
AlekseiNikiforovIBM Nov 28, 2025
59d8d4e
vulkan: improve topk perf for large k, fix overflow in unit tests (#1…
jeffbolznv Nov 29, 2025
47a268e
Vulkan: MMVQ Integer Dot K-Quant and MUL_MAT_ID support (#16900)
0cc4m Nov 29, 2025
f698a79
ggml: replace hwcap with riscv_hwprobe for RVV detection (#17567)
ixgbe Nov 29, 2025
7d2add5
sycl : support to malloc memory on device more than 4GB, update the d…
arthw Nov 29, 2025
0874693
common : fix json schema with '\' in literals (#17307)
i-v-s Nov 29, 2025
8c32d9d
server: explicitly set the function name in lambda (#17538)
haiyuewa Nov 29, 2025
ab49f09
server: move server-context to its own cpp|h (#17595)
ngxson Nov 29, 2025
385c3da
vulkan : fix FA mask load with bounds check (coopmat2) (#17606)
Acly Nov 30, 2025
00425e2
cuda : add error checking for cudaMemcpyAsync in argsort (#17599)
Mahekk357 Nov 30, 2025
c7af376
CUDA: add stream-based concurrency (#16991)
am17an Nov 30, 2025
5a6241f
common: update env var name (#17588)
ddh0 Nov 30, 2025
fa04659
ggml: fix: macOS build with `-DGGML_BACKEND_DL=ON` (#17581)
giladgd Nov 30, 2025
def5404
common: add LLAMA_LOG_FILE env var (#17609)
taronaeo Nov 30, 2025
beb1f0c
common : throttle download progress output to reduce IO flush (#17427)
angt Nov 30, 2025
3c136b2
cli: add migration warning (#17620)
ngxson Nov 30, 2025
7f8ef50
clip: fix nb calculation for qwen3-vl (#17594)
ngxson Nov 30, 2025
2ba7195
model: LFM2-VL fixes (#17577)
tdakhran Nov 30, 2025
0a4aeb9
cmake : add option to build and link LibreSSL (#17552)
angt Nov 30, 2025
ff90508
contributing: update guidelines for AI-generated code (#17625)
ngxson Nov 30, 2025
6eea666
llama-graph: avoid expand_forward for fusion (#17633)
am17an Dec 1, 2025
90c72a6
ggml : extend the GGML_SCHED_NO_REALLOC debug logic of the scheduler …
ggerganov Dec 1, 2025
649495c
metal : add FA head size 48 (#17619)
ggerganov Dec 1, 2025
cd3c118
model: support Ministral3 (#17644)
ngxson Dec 1, 2025
7733409
common: improve verbosity level definitions (#17630)
ngxson Dec 1, 2025
ec18edf
server: introduce API for serving / loading / unloading multiple mode…
ngxson Dec 1, 2025
00c361f
fix: llama arch implementation (#17665)
giladgd Dec 1, 2025
ecf74a8
mtmd: add mtmd_context_params::warmup option (#17652)
ngxson Dec 1, 2025
9810cb8
ops.md: update vulkan support (#17661)
jeffbolznv Dec 1, 2025
746f9ee
Override SSM_A op for Qwen3 Next to reduce splits (#17587)
pwilkin Dec 1, 2025
98bd9ab
enhance argsort for UT (#17573)
NeoZhangJianyu Dec 2, 2025
7b6d745
release: fix duplicate libs, store symbolic links (#17299)
taronaeo Dec 2, 2025
ed32089
ggml-cuda: reorder only relevant nodes (#17639)
am17an Dec 2, 2025
cee92af
Add context info to server error (#17663)
allozaur Dec 2, 2025
ab6726e
ggml : add fallback definition for HWCAP2_SVE2 (#17683)
angt Dec 2, 2025
4574f29
ci : skip winget update when not in ggml-org (#17465)
angt Dec 2, 2025
682e665
server: explicitly set exec path when create new instance (#17669)
ngxson Dec 2, 2025
fd3abe8
server: fixing naming conflict res_error in server-models.cpp (#17679)
w169q169 Dec 2, 2025
5d6bd84
server: remove default "gpt-3.5-turbo" model name (#17668)
ngxson Dec 2, 2025
2c453c6
convert: add error message for mistral3 quantized weight (#17686)
ngxson Dec 2, 2025
f3a9674
llama : fix signed comparison warning on FreeBSD (#17497)
angt Dec 2, 2025
b9a3771
codeowners : remove ericcurtin (#17658)
ericcurtin Dec 2, 2025
7f3a72a
ggml : remove redundant n_copies check when setting input/output (#17…
danbev Dec 2, 2025
a2b0fe8
CANN: Disable Ger operator of OUT_PROD on 310p device (#17563)
TianHao324 Dec 2, 2025
e148380
ggml : use svcntb() for SVE vector length detection (#17474)
angt Dec 2, 2025
c4357dc
Server: Change Invalid Schema from Server Error (500) to User Error (…
chadvoegele Dec 2, 2025
e251e5e
cmake : add utf8 compilation options for msvc (#17682)
xiaobing318 Dec 2, 2025
61bde8e
vulkan: Reduce temporary memory usage for TOP_K (#17623)
jeffbolznv Dec 2, 2025
4eba8d9
ci : RVV1.0 builds with tests (#16682)
alitariq4589 Dec 2, 2025
a96283a
mtmd: fix --no-warmup (#17695)
ngxson Dec 2, 2025
13628d8
server: add --media-path for local media files (#17697)
ngxson Dec 2, 2025
16cc3c6
build: document how to compile with Vulkan using Debian/Ubuntu packag…
socram8888 Dec 3, 2025
37adc9c
ggml, llama : use defaulted constructors/destructors (#17649)
GermanAizek Dec 3, 2025
b3e3060
ci : move release details to the top visible by default (#17719)
CISC Dec 3, 2025
7ca5991
ggml webgpu: add support for emscripten builds (#17184)
reeselevine Dec 3, 2025
5ceed62
server: fix duplicate HTTP headers in multiple models mode (#17698)
ServeurpersoCom Dec 3, 2025
0a8026e
common : introduce composable PEG parser combinators for chat parsing…
aldehir Dec 3, 2025
7feb0a1
ci : remove the build of openeuler-cann in release (#17724)
xuedinge233 Dec 3, 2025
3d94e96
metal : fix data race in pipeline library (#17731)
ggerganov Dec 3, 2025
083e18b
cmake: explicitly link against crypt32 on non-MSVC Windows builds (#1…
angt Dec 3, 2025
1257491
server : fix bad fmt, size() is a size_type (#17735)
angt Dec 3, 2025
e7c2cf1
server: add router multi-model tests (#17704) (#17722)
ServeurpersoCom Dec 3, 2025
190c483
chat : reserve memory in compute_diffs and improve naming (#17729)
ggerganov Dec 3, 2025
2e1c9cd
CUDA: generalized (mma) FA, add Volta support (#17505)
JohannesGaessler Dec 3, 2025
41c5e02
webui: Fix zero pasteLongTextToFileLen to disable conversion being ov…
awasisto Dec 3, 2025
e9f9483
Use OpenAI-compatible `/v1/models` endpoint by default (#17689)
allozaur Dec 3, 2025
424c579
convert : support latest mistral-common (fix conversion with --mistra…
SmartestWashingMachine Dec 3, 2025
c6d1a00
Add a couple of file types to the text section (#17670)
pwilkin Dec 3, 2025
dea9ba2
ggml-cpu: remove duplicate conditional check 'iid' (#17650)
GermanAizek Dec 3, 2025
d8b5cdc
build: enable parallel builds in msbuild using MTT (#17708)
jeffbolznv Dec 4, 2025
ef75a89
build : move _WIN32_WINNT definition to headers (#17736)
angt Dec 4, 2025
a67ef0f
llama : fix sanity checks during quantization (#17721)
ggerganov Dec 4, 2025
0d13248
metal : use params per pipeline instance (#17739)
ggerganov Dec 4, 2025
83c1171
common: use native MultiByteToWideChar (#17738)
angt Dec 4, 2025
7dba049
ci : disable ggml-ci-x64-amd-* (#17753)
CISC Dec 4, 2025
2a73f81
cmake : simplify build info detection using standard variables (#17423)
angt Dec 4, 2025
3659aa2
convert: use existing local chat_template if mistral-format model has…
SmartestWashingMachine Dec 4, 2025
87a2084
ggml-cpu : remove asserts always evaluating to false (#17728)
Alcpz Dec 4, 2025
bd4ef13
common : skip model validation when --help is requested (#17755)
danbev Dec 4, 2025
817d743
examples : add missing code block end marker [no ci] (#17756)
danbev Dec 4, 2025
c4c10bf
server: move msg diffs tracking to HTTP thread (#17740)
ngxson Dec 4, 2025
9d02299
server: strip content-length header on proxy (#17734)
ngxson Dec 4, 2025
bde188d
metal: TRI, FILL, EXPM1, SOFTPLUS (#16623)
gabe-l-hart Dec 4, 2025
96fe9ba
Add support for CUMSUM and TRI for CUDA. (#17584)
pwilkin Dec 4, 2025
3143a75
docs : update ops.md (Metal, BLAS) (#17768)
gabe-l-hart Dec 4, 2025
03d9a77
ci : transform release binary root dir in tar to llama-bXXXX (#17773)
CISC Dec 5, 2025
668ed76
HIP: enable WMMA-MMQ INT kernels for RDNA 3 (#17576)
jiachengjason Dec 5, 2025
e95d0bc
CUDA: fix FA VKQ accumulator overflow (#17746)
JohannesGaessler Dec 5, 2025
6648989
Add pwilkin to CODEOWNERS for chat files (#17789)
pwilkin Dec 5, 2025
3a0d105
Q4/Q8 Tiled Gemm Optimization. (#16999)
shalinib-ibm Dec 5, 2025
a6cfc21
ci : fix winget workflow (#17790)
angt Dec 5, 2025
1be9783
fix: prevent segfault in tokenizer on highly repetitive input (#17786)
ServeurpersoCom Dec 5, 2025
6016d0b
HIP : fix RDNA4 build (#17792)
JohannesGaessler Dec 5, 2025
c41bde6
metal : add residency sets keep-alive heartbeat (#17766)
ggerganov Dec 5, 2025
8160b38
rpc : fix alloc size logic (#17116)
ggerganov Dec 5, 2025
93bb926
vulkan: set all memory allocations to high priority (#17624)
jeffbolznv Dec 5, 2025
6ab0d64
vulkan: enable mmvq for q2_k on NVIDIA (#17675)
jeffbolznv Dec 5, 2025
fd57b24
ggml webgpu: unary op suppport, code refactoring, ops support (#17764)
reeselevine Dec 5, 2025
e15cd06
vulkan : support conv-2d with large output size (#17685)
Acly Dec 5, 2025
a0f3897
vulkan: fix top_k bug when there are ties in the input (#17659)
jeffbolznv Dec 5, 2025
933414c
vulkan: add more num_blocks instantiations in rms_norm (#17701)
jeffbolznv Dec 5, 2025
d8c0a7b
vulkan: Fix mismatch in TOPK_MOE unit test (#17541)
rillomas Dec 6, 2025
67788f6
vulkan: Replace deprecated VK_EXT_validation_features (#17637)
rillomas Dec 6, 2025
8ce774a
metal : fix build(#17799)
ggerganov Dec 6, 2025
8e5f498
contrib : stale PRs (#17803)
ggerganov Dec 6, 2025
c6c5e85
vulkan: support solve_tri with larger N/K values (#17781)
jeffbolznv Dec 6, 2025
dbc15a7
convert: support Mistral 3 Large MoE (#17730)
ngxson Dec 6, 2025
2960eb2
vulkan: Use one row per workgroup for f32 mmv (#17711)
jeffbolznv Dec 6, 2025
444f00b
llama : remove quantization sanity check (#17788)
danbev Dec 6, 2025
7b43f55
ggml : improve error handling for search path existence checks (#17653)
flyinskyin2013 Dec 6, 2025
21f24f2
webui: Per-conversation system message with UI displaying, edition & …
allozaur Dec 6, 2025
e31b5c5
webui: Fix context available value in Multi-model Router mode (#17804)
allozaur Dec 6, 2025
a28e3c7
webui: Stop generation from chat sidebar (#17806)
allozaur Dec 6, 2025
f334b79
HIP: fix RDNA3 FP16/BF16 matrix multiplication (#17817)
JohannesGaessler Dec 6, 2025
09c7c50
ggml : add circular tiling support to pad, for Vulkan, CUDA, and CPU …
Phylliida Dec 6, 2025
c42712b
server: support multiple generations from one prompt (OAI "n" option)…
ngxson Dec 6, 2025
017761d
ggml-zendnn : add ZenDNN backend for AMD CPUs (#17690)
z-vishal Dec 6, 2025
db97837
vulkan: perf_logger improvements (#17672)
jeffbolznv Dec 6, 2025
d9e03db
sycl: add missing BF16 conversion support for Intel oneAPI (#17780)
yingying0906 Dec 7, 2025
2257758
common : change --color to accept on/off/auto, default to auto (#17827)
CISC Dec 7, 2025
0a540f9
ci : add windows-cuda 13.1 release (#17839)
CISC Dec 7, 2025
08f9d3c
Vulkan: improve mul_mat_vec_iq1_m (#16907)
lovedheart Dec 7, 2025
4d37262
model: add llama 4 scaling for mistral-large (deepseek arch) (#17744)
ngxson Dec 7, 2025
79d6189
ggml-cpu: add ggml_thread_cpu_relax with Zihintpause support (#17784)
ixgbe Dec 8, 2025
5814b4d
cuda: optimize SOLVE_TRI using registers and FMAF (#17703)
wsbagnsv1 Dec 8, 2025
2bc9693
server : make cache_reuse configurable per request (#17858)
ggerganov Dec 8, 2025
37a4f63
server : add development documentation (#17760)
ngxson Dec 8, 2025
51e0c2d
cuda : add FILL op support (#17851)
JayZenith Dec 8, 2025
636fc17
Fix Kimi-K2 tool-call parsing issues (#17376)
hksdpc255 Dec 8, 2025
e4e9c43
Make graph_max_nodes vary by ubatch size (#17794)
pwilkin Dec 8, 2025
f896d2c
server: improve speed of speculative decoding (#17808)
ngxson Dec 8, 2025
68522c6
ci : support bfloat16 SYCL release package (#17855)
arthw Dec 8, 2025
951520d
server: delegate result_state creation to server_task (#17835)
ngxson Dec 8, 2025
2fa51c1
model-conversion : add token ids to prompt token output [no ci] (#17863)
danbev Dec 8, 2025
c8554b6
graph : use fill instead of scale_bias in grouped expert selection (#…
CISC Dec 8, 2025
1d2a1ab
model : support Rnj-1 (#17811)
philip-essential Dec 9, 2025
e39502e
llama : add token matching support to llama-grammar (#17816)
aldehir Dec 9, 2025
0cdce38
CUDA: fix FP16 overflow in tile FA kernel (#17875)
JohannesGaessler Dec 9, 2025
ca709e4
CANN: add support for partial RoPE and Vision mode (#17543)
noemotiovon Dec 9, 2025
4e842d5
console: allow using arrow left/right, home/end keys and history mode…
ngxson Dec 9, 2025
42b12b5
model : nit, DeepSeek V1 MoE is 16B and GigaChat is 20B (#12652)
CISC Dec 9, 2025
63908b6
cmake: fix Mach-O current version number (#17877)
Rhys-T Dec 9, 2025
86a3f0f
ggml : allow fill node alloc inplace (#17870)
CISC Dec 9, 2025
6b82eb7
metal : print node names for debugging (#17882)
ggerganov Dec 9, 2025
02e409a
ggml : Provide macos-specific backtrace printing to avoid terminal de…
gabe-l-hart Dec 9, 2025
48f4756
docs: clarify that CPU support should be first (#17886)
JohannesGaessler Dec 9, 2025
b635092
Add DIAG for CUDA (#17873)
pwilkin Dec 9, 2025
086a63e
metal: SSM kernel improvements (#17876)
gabe-l-hart Dec 9, 2025
6339185
docs : update cpu and cuda ops (#17890)
CISC Dec 9, 2025
2fbe3b7
common : add parser for ministral/mistral large 3/devstral 2 (#17713)
aldehir Dec 9, 2025
2e9eab8
fix softmax for iGPU (#17838)
NeoZhangJianyu Dec 10, 2025
9e79b01
convert: allow using quantized Mistral weight (#17889)
ngxson Dec 10, 2025
17f7f4b
CUDA: fix unpadded strides in MMA FA kernel (#17891)
JohannesGaessler Dec 10, 2025
2d2e103
docs : update opencl ops (#17904)
lhez Dec 10, 2025
b677721
model : Qwen3-Next-80B-A3B has 48 layers (#17898)
EZForever Dec 10, 2025
6c21317
cli: new CLI experience (#17824)
ngxson Dec 10, 2025
4df6e85
cuda : add missing support check for xielu (#17895)
CISC Dec 10, 2025
edf82bd
Squashed commit of the following:
SamuelOliveirads Dec 10, 2025
0525086
speculative: optimize graph reuse for GLM-4.5
SamuelOliveirads Dec 10, 2025
09eab12
glm4: add MTP weight fallback for GLM-4.6 compatibility
SamuelOliveirads Dec 11, 2025
a88ef35
glm-moe: allow skipping MTP tensor loading to save VRAM
SamuelOliveirads Dec 19, 2025
7d782ab
common: simplify speculative sampling to greedy-only for performance
SamuelOliveirads Dec 20, 2025
9 changes: 8 additions & 1 deletion .clang-format
@@ -22,7 +22,14 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: false
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
AttributeMacros:
- __host__
- __device__
- __global__
- __forceinline__
- __launch_bounds__
BinPackArguments: true
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
1 change: 1 addition & 0 deletions .clang-tidy
@@ -17,6 +17,7 @@ Checks: >
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
-performance-enum-size,
portability-*,
-portability-simd-intrinsics,
misc-*,
13 changes: 6 additions & 7 deletions .devops/cann.Dockerfile
@@ -3,17 +3,15 @@
# ==============================================================================

# Define the CANN base image for easier version updates later
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
ARG CHIP_TYPE=910b
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# Define the Ascend chip model for compilation. Default is Ascend910B3
ARG ASCEND_SOC_TYPE=Ascend910B3

# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
yum clean all && \
@@ -36,20 +34,21 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# For brevity, only core variables are listed here. You can paste the original ENV list here.

# -- Build llama.cpp --
# Use the passed ASCEND_SOC_TYPE argument and add general build options
# Use the passed CHIP_TYPE argument and add general build options
ARG CHIP_TYPE
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
&& \
cmake -B build \
-DGGML_CANN=ON \
-DCMAKE_BUILD_TYPE=Release \
-DSOC_TYPE=${ASCEND_SOC_TYPE} \
-DSOC_TYPE=ascend${CHIP_TYPE} \
. && \
cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
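A minimal usage sketch for the reworked CANN image build, assuming it is run from the repository root; the image tag is an illustrative assumption, not part of this change.

# Hypothetical build invocation (sketch): CHIP_TYPE defaults to 910b and now selects
# both the CANN base image tag and the SOC_TYPE (ascend${CHIP_TYPE}) passed to CMake.
docker build -f .devops/cann.Dockerfile --build-arg CHIP_TYPE=910b -t llama.cpp:cann .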
22 changes: 0 additions & 22 deletions .devops/cloud-v-pipeline

This file was deleted.

8 changes: 2 additions & 6 deletions .devops/cpu.Dockerfile
@@ -4,27 +4,23 @@ FROM ubuntu:$UBUNTU_VERSION AS build

ARG TARGETARCH

ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "$TARGETARCH" = "amd64" ]; then \
RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \
fi && \
cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
4 changes: 2 additions & 2 deletions .devops/cuda.Dockerfile
@@ -25,7 +25,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
@@ -61,7 +61,7 @@ RUN apt-get update \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
8 changes: 4 additions & 4 deletions .devops/intel.Dockerfile
@@ -1,8 +1,8 @@
ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04

## Build Image

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
@@ -21,7 +21,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
@@ -31,7 +31,7 @@ RUN mkdir -p /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

RUN apt-get update \
&& apt-get install -y libgomp1 curl\
4 changes: 2 additions & 2 deletions .devops/musa.Dockerfile
@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc4.2.0
ARG MUSA_VERSION=rc4.3.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

@@ -32,7 +32,7 @@ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
6 changes: 2 additions & 4 deletions .devops/nix/package.nix
@@ -34,6 +34,7 @@
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true,
useVulkan ? false,
useRpc ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

# It's necessary to consistently use backendStdenv when building with CUDA support,
@@ -128,10 +129,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
};

postPatch = ''
substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
'';

# With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
@@ -179,6 +176,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
(cmakeBool "GGML_METAL" useMetalKit)
(cmakeBool "GGML_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic)
(cmakeBool "GGML_RPC" useRpc)
]
++ optionals useCuda [
(
29 changes: 15 additions & 14 deletions .devops/rocm.Dockerfile
@@ -1,10 +1,10 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.4
ARG AMDGPU_VERSION=6.4
ARG ROCM_VERSION=7.0
ARG AMDGPU_VERSION=7.0

# Target the CUDA build image
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

### Build image
@@ -13,18 +13,14 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
# gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102,not officialy supported
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html

ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
#ARG ROCM_DOCKER_ARCH=gfx1100
ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
#ARG ROCM_DOCKER_ARCH='gfx1151'

# Set nvcc architectured
# Set ROCm architectures
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++

RUN apt-get update \
&& apt-get install -y \
@@ -40,11 +36,16 @@ WORKDIR /app
COPY . .

RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
cmake -S . -B build \
-DGGML_HIP=ON \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
-DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
&& cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib \
&& find build -name "*.so" -exec cp {} /app/lib \;
&& find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
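A minimal usage sketch for the updated ROCm image, assuming a build from the repository root; the single gfx1100 target and the image tag are illustrative assumptions. Note that ROCM_DOCKER_ARCH is now a semicolon-separated list.

# Hypothetical build invocation (sketch): restrict the fat build to one architecture
# instead of the default semicolon-separated list shown in the Dockerfile above.
docker build -f .devops/rocm.Dockerfile --build-arg ROCM_DOCKER_ARCH='gfx1100' -t llama.cpp:rocm-gfx1100 .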
126 changes: 126 additions & 0 deletions .devops/s390x.Dockerfile
@@ -0,0 +1,126 @@
ARG GCC_VERSION=15.2.0
ARG UBUNTU_VERSION=24.04

### Build Llama.cpp stage
FROM gcc:${GCC_VERSION} AS build

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
apt update -y && \
apt upgrade -y && \
apt install -y --no-install-recommends \
git cmake ccache ninja-build \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
libopenblas-dev libcurl4-openssl-dev && \
rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . .

RUN --mount=type=cache,target=/root/.ccache \
--mount=type=cache,target=/app/build \
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_NATIVE=OFF \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS && \
cmake --build build --config Release -j $(nproc) && \
cmake --install build --prefix /opt/llama.cpp

COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin

COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
COPY requirements /opt/llama.cpp/gguf-py/requirements


### Collect all llama.cpp binaries, libraries and distro libraries
FROM scratch AS collector

# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


### Base image
FROM ubuntu:${UBUNTU_VERSION} AS base

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
apt update -y && \
apt install -y --no-install-recommends \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
# See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
curl libgomp1 libopenblas-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete

# Copy llama.cpp libraries
COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu


### Full
FROM base AS full

ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
apt update -y && \
apt install -y \
git cmake libjpeg-dev \
python3 python3-pip python3-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py

RUN pip install --no-cache-dir --break-system-packages \
-r /app/gguf-py/requirements.txt

ENTRYPOINT [ "/app/tools.sh" ]


### CLI Only
FROM base AS light

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]


### Server
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

EXPOSE 8080

ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
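A minimal usage sketch for the new s390x Dockerfile, assuming a build from the repository root; the image tags are illustrative assumptions. The file defines full, light (CLI-only) and server stages.

# Hypothetical build invocations (sketch): pick a stage with --target.
docker build -f .devops/s390x.Dockerfile --target server -t llama.cpp:server-s390x .
docker build -f .devops/s390x.Dockerfile --target light -t llama.cpp:light-s390x .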