Skip to content
This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 9bd5af3

Browse files
committed
Merge remote-tracking branch 'main' into feat/1635/decouple-nitro-inference-engine-into-a-library
2 parents c6a2656 + 47a6cac commit 9bd5af3

File tree

15 files changed

+309
-290
lines changed

15 files changed

+309
-290
lines changed

.github/scripts/e2e-test-llama-windows.bat

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ set /a max=11000
2525
set /a range=max-min+1
2626
set /a PORT=%min% + %RANDOM% %% %range%
2727

28-
rem Kill any existing Nitro processes
29-
echo Killing any existing Nitro processes...
30-
taskkill /f /im nitro.exe 2>nul
31-
3228
rem Start the binary file
3329
start /B "" "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > %TEMP%\nitro.log 2>&1
3430

@@ -64,9 +60,9 @@ echo curl_data1=%curl_data1%
6460
echo curl_data2=%curl_data2%
6561

6662
rem Run the curl commands and capture the status code
67-
curl.exe -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/loadModel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1
63+
curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/loadModel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1
6864

69-
curl.exe -o "%TEMP%\response2.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/chat_completion" ^
65+
curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/chat_completion" ^
7066
--header "Content-Type: application/json" ^
7167
--header "Accept: text/event-stream" ^
7268
--header "Access-Control-Allow-Origin: *" ^
@@ -78,13 +74,13 @@ rem Read the status codes from the log files
7874
for /f %%a in (%TEMP%\response1.log) do set "response1=%%a"
7975
for /f %%a in (%TEMP%\response2.log) do set "response2=%%a"
8076

81-
if "%response1%" neq "000" (
77+
if "%response1%" neq "200" (
8278
echo The first curl command failed with status code: %response1%
8379
type %TEMP%\response1.log
8480
set "error_occurred=1"
8581
)
8682

87-
if "%response2%" neq "000" (
83+
if "%response2%" neq "200" (
8884
echo The second curl command failed with status code: %response2%
8985
type %TEMP%\response2.log
9086
set "error_occurred=1"
@@ -111,4 +107,4 @@ echo Nitro test run successfully!
111107

112108
rem Kill the server process
113109
@REM taskkill /f /pid %pid%
114-
taskkill /f /im nitro.exe 2>nul || exit /B 0
110+
taskkill /f /im nitro.exe 2>nul || exit /B 0

.github/scripts/e2e-test-whisper-windows.bat

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ set /a max=11000
2525
set /a range=max-min+1
2626
set /a PORT=%min% + %RANDOM% %% %range%
2727

28-
rem Kill any existing Nitro processes
29-
echo Killing any existing Nitro processes...
30-
taskkill /f /im nitro.exe 2>nul
31-
3228
rem Start the binary file
3329
start /B "" "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > %TEMP%\nitro.log 2>&1
3430

@@ -49,28 +45,21 @@ if not defined pid (
4945

5046
rem Wait for a few seconds to let the server start
5147

52-
rem Check if %TEMP%\testmodel exists, if not, download it
48+
rem Check if %TEMP%\testwhisper exists, if not, download it
5349
if not exist "%MODEL_PATH%" (
5450
bitsadmin.exe /transfer "DownloadTestModel" %DOWNLOAD_URL% "%MODEL_PATH%"
5551
)
5652

5753
rem Define JSON strings for curl data
5854
call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%"
59-
set "curl_data1={\"model_path\":\"%MODEL_PATH_STRING%\",\"model_id\":\"whisper.cpp\"}"
60-
61-
rem Print the values of curl_data1 for debugging
62-
echo curl_data1=%curl_data1%
55+
set "curl_data1={\"model_path\":\"%MODEL_PATH_STRING%\",\"model_id\":\"whisper\"}"
6356

6457
rem Run the curl commands and capture the status code
65-
curl.exe -o %TEMP%\response1_code.log -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/audio/load_model" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1_code.log 2>&1
58+
curl.exe -o %TEMP%\response1.log -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/audio/load_model" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1_code.log 2>&1
6659

67-
curl.exe -o %TEMP%\response2_code.log -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/audio/transcriptions" ^
68-
--header "Access-Control-Allow-Origin: *" ^
69-
--form 'model_id="whisper.cpp"' ^
70-
--form 'file=@"..\whisper.cpp\samples\jfk.wav"' ^
71-
--form 'temperature="0.0"' ^
72-
--form 'prompt="The transcript is about OpenAI which makes technology like DALL·E, GPT-3, and ChatGPT with the hope of one day building an AGI system that benefits all of humanity. The president is trying to raly people to support the cause."' ^
73-
> %TEMP%\response2_code.log 2>&1
60+
curl -o %TEMP%\response2.log -s -w "%%{http_code}" --location "http://localhost:%PORT%/v1/audio/transcriptions" ^
61+
--form "file=@../..//whisper.cpp/samples/jfk.wav" ^
62+
--form "model_id=whisper" > %TEMP%\response2_code.log 2>&1
7463

7564
set "error_occurred=0"
7665

@@ -80,13 +69,13 @@ for /f %%a in (%TEMP%\response2_code.log) do set "response2=%%a"
8069

8170
if "%response1%" neq "200" (
8271
echo The first curl command failed with status code: %response1%
83-
type %TEMP%\response1_code.log
72+
type %TEMP%\response1.log
8473
set "error_occurred=1"
8574
)
8675

87-
if "%response2%" neq "000" (
76+
if "%response2%" neq "200" (
8877
echo The second curl command failed with status code: %response2%
89-
type %TEMP%\response2_code.log
78+
type %TEMP%\response2.log
9079
set "error_occurred=1"
9180
)
9281

@@ -101,14 +90,13 @@ if "%error_occurred%"=="1" (
10190

10291
echo ----------------------
10392
echo Log load model:
104-
type %TEMP%\response1_code.log
93+
type %TEMP%\response1.log
10594

10695
echo ----------------------
10796
echo "Log run test:"
108-
type %TEMP%\response2_code.log
97+
type %TEMP%\response2.log
10998

11099
echo Nitro test run successfully!
111100

112101
rem Kill the server process
113-
@REM taskkill /f /pid %pid%
114102
taskkill /f /im nitro.exe 2>nul || exit /B 0

.github/workflows/build.yml

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ on:
5050
env:
5151
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
5252
LLM_MODEL_URL: https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
53-
WHISPER_MODEL_URL: https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin
53+
WHISPER_MODEL_URL: https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q5_1.bin
5454

5555
jobs:
5656
create-draft-release:
@@ -377,11 +377,11 @@ jobs:
377377
run: |
378378
# To test with CoreML
379379
if [[ ! -f "/tmp/testwhisper-encoder.mlmodelc" ]]; then
380-
wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-encoder.mlmodelc.zip
381-
unzip ggml-tiny.en-encoder.mlmodelc.zip
382-
rm ggml-tiny.en-encoder.mlmodelc.zip
380+
wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-encoder.mlmodelc.zip
381+
unzip ggml-tiny-encoder.mlmodelc.zip
382+
rm ggml-tiny-encoder.mlmodelc.zip
383383
rm -rf /tmp/testwhisper-encoder.mlmodelc
384-
mv ggml-tiny.en-encoder.mlmodelc /tmp/testwhisper-encoder.mlmodelc
384+
mv ggml-tiny-encoder.mlmodelc /tmp/testwhisper-encoder.mlmodelc
385385
fi
386386
# run e2e testing
387387
cd nitro
@@ -586,7 +586,7 @@ jobs:
586586
cmake --build ./build_deps/nitro_deps --config Release
587587
mkdir -p build
588588
cd build
589-
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }}
589+
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }}
590590
cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%"
591591
592592
- name: Pack artifacts
@@ -602,15 +602,19 @@ jobs:
602602
7z a -ttar temp.tar .\build\Release\*
603603
7z a -tgzip nitro.tar.gz temp.tar
604604
605-
# - name: Run e2e testing - Llama.cpp
606-
# shell: cmd
607-
# run: |
608-
# .\.github\scripts\e2e-test-llama-windows.bat .\build\Release\nitro.exe ${{ env.LLM_MODEL_URL }}
605+
- name: Run e2e testing - Llama.cpp
606+
shell: cmd
607+
run: |
608+
cd build\Release
609+
..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }}
610+
rmdir /S /Q .\build\Release\uploads
609611
610-
# - name: Run e2e testing - Whisper.cpp
611-
# shell: cmd
612-
# run: |
613-
# .\.github\scripts\e2e-test-whisper-windows.bat .\build\Release\nitro.exe ${{ env.WHISPER_MODEL_URL }}
612+
- name: Run e2e testing - Whisper.cpp
613+
shell: cmd
614+
run: |
615+
cd build\Release
616+
..\..\.github\scripts\e2e-test-whisper-windows.bat nitro.exe ${{ env.WHISPER_MODEL_URL }}
617+
rmdir /S /Q .\build\Release\uploads
614618
615619
- name: Upload Artifact
616620
uses: actions/upload-artifact@v2
@@ -679,7 +683,7 @@ jobs:
679683
cmake --build ./build_deps/nitro_deps --config Release
680684
mkdir -p build
681685
cd build
682-
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }}
686+
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }}
683687
cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%"
684688
685689
- name: Pack artifacts
@@ -770,7 +774,7 @@ jobs:
770774
cmake --build ./build_deps/nitro_deps --config Release
771775
mkdir -p build
772776
cd build
773-
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DWHISPER_CUBLAS=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }}
777+
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DWHISPER_CUBLAS=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }}
774778
cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%"
775779
776780
- name: Pack artifacts

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ add_executable(${PROJECT_NAME} main.cc)
5959
#
6060
# and comment out the following lines
6161
find_package(Drogon CONFIG REQUIRED)
62-
target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama llava whisper
62+
target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama whisper llava
6363
${CMAKE_THREAD_LIBS_INIT})
6464

6565
# ##############################################################################

README.md

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -92,21 +92,6 @@ curl http://localhost:3928/v1/chat/completions \
9292
}'
9393
```
9494

95-
***OPTIONAL***: You can constrain the sampling using GBNF grammars by providing path to a grammar file
96-
```bash title="Nitro Inference With Grammar"
97-
curl http://localhost:3928/v1/chat/completions \
98-
-H "Content-Type: application/json" \
99-
-d '{
100-
"messages": [
101-
{
102-
"role": "user",
103-
"content": "Who won the world series in 2020?"
104-
},
105-
],
106-
"grammar_file": "/path/to/grammarfile"
107-
}'
108-
```
109-
11095
Table of parameters
11196

11297
| Parameter | Type | Description |
@@ -128,6 +113,7 @@ Table of parameters
128113
|`grp_attn_n`|Integer|Group attention factor in self-extend|
129114
|`grp_attn_w`|Integer|Group attention width in self-extend|
130115
|`mlock`|Boolean|Prevent system swapping of the model to disk in macOS|
116+
|`grammar_file`| String |You can constrain the sampling using GBNF grammars by providing the path to a grammar file|
131117

132118
***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal
133119
```zsh
@@ -155,37 +141,37 @@ To compile nitro please visit [Compile from source](docs/docs/new/build-source.m
155141
<tr>
156142
<td style="text-align:center"><b>Stable (Recommended)</b></td>
157143
<td style="text-align:center">
158-
<a href='https://github.com/janhq/nitro/releases/download/v0.3.1/nitro-0.3.1-win-amd64.tar.gz'>
144+
<a href='https://github.com/janhq/nitro/releases/download/v0.3.6/nitro-0.3.6-win-amd64.tar.gz'>
159145
<img src='./docs/static/img/windows.png' style="height:15px; width: 15px" />
160146
<b>CPU</b>
161147
</a>
162148
</td>
163149
<td style="text-align:center">
164-
<a href='https://github.com/janhq/nitro/releases/download/v0.3.1/nitro-0.3.1-win-amd64-cuda.tar.gz'>
150+
<a href='https://github.com/janhq/nitro/releases/download/v0.3.6/nitro-0.3.6-win-amd64-cuda.tar.gz'>
165151
<img src='./docs/static/img/windows.png' style="height:15px; width: 15px" />
166152
<b>CUDA</b>
167153
</a>
168154
</td>
169155
<td style="text-align:center">
170-
<a href='https://github.com/janhq/nitro/releases/download/v0.3.1/nitro-0.3.1-mac-amd64.tar.gz'>
156+
<a href='https://github.com/janhq/nitro/releases/download/v0.3.6/nitro-0.3.6-mac-amd64.tar.gz'>
171157
<img src='./docs/static/img/mac.png' style="height:15px; width: 15px" />
172158
<b>Intel</b>
173159
</a>
174160
</td>
175161
<td style="text-align:center">
176-
<a href='https://github.com/janhq/nitro/releases/download/v0.3.1/nitro-0.3.1-mac-arm64.tar.gz'>
162+
<a href='https://github.com/janhq/nitro/releases/download/v0.3.6/nitro-0.3.6-mac-arm64.tar.gz'>
177163
<img src='./docs/static/img/mac.png' style="height:15px; width: 15px" />
178164
<b>M1/M2</b>
179165
</a>
180166
</td>
181167
<td style="text-align:center">
182-
<a href='https://github.com/janhq/nitro/releases/download/v0.3.1/nitro-0.3.1-linux-amd64.tar.gz'>
168+
<a href='https://github.com/janhq/nitro/releases/download/v0.3.6/nitro-0.3.6-linux-amd64.tar.gz'>
183169
<img src='./docs/static/img/linux.png' style="height:15px; width: 15px" />
184170
<b>CPU</b>
185171
</a>
186172
</td>
187173
<td style="text-align:center">
188-
<a href='https://github.com/janhq/nitro/releases/download/v0.3.1/nitro-0.3.1-linux-amd64-cuda.tar.gz'>
174+
<a href='https://github.com/janhq/nitro/releases/download/v0.3.6/nitro-0.3.6-linux-amd64-cuda.tar.gz'>
189175
<img src='./docs/static/img/linux.png' style="height:15px; width: 15px" />
190176
<b>CUDA</b>
191177
</a>
@@ -194,7 +180,7 @@ To compile nitro please visit [Compile from source](docs/docs/new/build-source.m
194180
<tr style="text-align: center">
195181
<td style="text-align:center"><b>Experimental (Nightly Build)</b></td>
196182
<td style="text-align:center" colspan="6">
197-
<a href='https://github.com/janhq/nitro/actions/runs/7701765982'>
183+
<a href='https://github.com/janhq/nitro/actions/runs/7790169166'>
198184
<b>GitHub action artifactory</b>
199185
</a>
200186
</td>

0 commit comments

Comments
 (0)