diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..0769b6f
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,23 @@
+name: Tests
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+jobs:
+  test:
+    runs-on: self-hosted  # Requires a self-hosted runner with a GPU and model weights.
+    # Register one at: Settings -> Actions -> Runners -> New self-hosted runner.
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install package
+        run: uv sync --extra dev
+
+      - name: Run tests
+        run: uv run pytest -v
+        env:
+          LLM_MODEL_CACHE: ${{ secrets.LLM_MODEL_CACHE }}
diff --git a/README.md b/README.md
index 901051d..54d5338 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,13 @@
 # llm-server
 
-Host an LLM as a non-blocking HTTP server with a simple interface for managing and querying models. Built on [FastAPI](https://fastapi.tiangolo.com/) and [uvicorn](https://www.uvicorn.org/), it wraps the [large-language-model](https://github.com/EricApgar/large-language-model) library to expose LLM inference over a REST API.
+Host an LLM on a non-blocking HTTP server with a simple interface for managing and querying models. This allows you to host an LLM on LLM-capable hardware and then access it over the network from devices that can't run an LLM locally.
+
+Built on [FastAPI](https://fastapi.tiangolo.com/) and [uvicorn](https://www.uvicorn.org/), it wraps the [large-language-model](https://github.com/EricApgar/large-language-model) library to expose LLM inference over a REST API.
+
+A user-friendly GUI is included (but **not** required) for general use.
+
+<div align="center">
+  <img src="assets/LLM Server GUI.png" alt="LLM Server GUI">
+</div>
+
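+To launch it, a minimal sketch (assumes the package was installed with the ```gui``` extra; ```run_gui()``` is re-exported at the package root):
+
+```
+import llm_server
+
+llm_server.run_gui()  # Assumed to block while the GUI is open; stop with Ctrl+C.
+```
+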
 # Setup
 See [Releases](https://github.com/EricApgar/llm-server/releases) to install from wheel file.
@@ -7,7 +15,7 @@
 See ```pyproject.toml``` for required Python version and dependencies.
 
 ## Optional Dependencies
-This library uses optional dependencies for additional features. See the ```pyproject.toml``` for the list of optional library tags. To install with the GUI support, use the ```gui``` tag.
+This library uses optional dependencies for additional features. See the ```pyproject.toml``` for the list of optional library tags. To install with GUI support, use the ```gui``` tag.
 
 ## Library
 Install this repo as a library into another project.
@@ -60,8 +68,9 @@
 pip install -e ".[gui]"
 ```
 
 Running an LLM requires an NVIDIA GPU with ideally a large number of TOPS. See the [large-language-model](https://github.com/EricApgar/large-language-model) repo for hardware details and GPU driver setup.
 
 # Usage
-Create a server, register a model with a tag, load it, and start serving.
+Create a server, register a model with a tag, load it, and start serving. Then send a request and receive a response.
 
+## Server
 ```
 import llm_server
@@ -75,22 +84,46 @@ server.start()  # Non-blocking.
 
 server.stop()
 ```
 
+## Requester
+```
+import requests
+
+URL = 'http://127.0.0.1:8001/ask'
+details = {...}  # See request body examples below.
+
+response = requests.post(URL, json=details, timeout=15)
+data = response.json()
+print(data['text'])
+```
+
 # API Endpoints
 
 | Method | Endpoint | Description |
 |-|-|-|
-| GET | `/` | Health check — returns `"Running."` |
-| GET | `/get-models` | List all registered models and their tags |
-| GET | `/ask-test` | Send a test prompt (`"Tell me a joke."`) to the first available model |
-| POST | `/ask` | Send a prompt to a specific model |
+| GET | `/` | Health check — returns `"Running."`. |
+| GET | `/get-models` | List all available hosted models and their tags. |
+| GET | `/ask-test` | Send a test prompt (`"Tell me a joke."`) to the first available model. |
+| POST | `/ask` | Send a prompt to a specific model. |
 
 ### POST /ask — Request Body
+
+#### Text Example
 ```
 {
     "tag": "gpt",
     "prompt": "Tell me a joke.",
-    "images": [],
     "max_tokens": 64,
     "temperature": 0.9
 }
 ```
-`prompt` may also be a conversation dict (see [llm-conversation](https://github.com/EricApgar/llm-conversation)). `images` accepts base64-encoded PNG strings for multimodal models.
+* `prompt` may also be a Conversation-formatted dict, i.e. the output of ```llm_conversation.Conversation.to_dict()``` (see [llm-conversation](https://github.com/EricApgar/llm-conversation)).
+
+#### Image Example
+```
+{
+    "tag": "Phi4",
+    "prompt": "Describe the image.",
+    "images": [llm_server.encode_image()],
+    "max_tokens": 64
+}
+```
+* The ```temperature``` arg is currently not supported for Phi-4-multimodal-instruct.
+* ```images``` accepts base64-encoded PNG strings for multimodal models. An encoder is provided by ```llm_server```.
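+
+A minimal end-to-end sketch of an image request (the image path and URL here are hypothetical; assumes a multimodal model is registered under the "Phi4" tag):
+
+```
+import requests
+from PIL import Image as PillowImage
+
+import llm_server
+
+# encode_image() returns a base64-encoded PNG string for the "images" field.
+encoded = llm_server.encode_image(image=PillowImage.open('photo.png'))
+
+details = {
+    'tag': 'Phi4',
+    'prompt': 'Describe the image.',
+    'images': [encoded],
+    'max_tokens': 64
+}
+
+response = requests.post('http://127.0.0.1:8001/ask', json=details, timeout=60)
+print(response.json()['text'])
+```
\ No newline at end of file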
diff --git a/assets/LLM Server GUI.png b/assets/LLM Server GUI.png
new file mode 100644
index 0000000..afa2e1a
Binary files /dev/null and b/assets/LLM Server GUI.png differ
diff --git a/pyproject.toml b/pyproject.toml
index ff0d341..dbc72d2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "llm_server"  # The pip install .
-version = "0.1.0"
+version = "0.2.0"
 description = "Template for repos that are intended to be packaged libraries."
 readme = "README.md"
 authors = [{ name = "Eric Apgar" }]
@@ -17,6 +17,10 @@ dependencies = [
 gui = [
     "nicegui>=3.8.0",
 ]
+dev = [
+    "pytest>=8.0.0",
+    "requests>=2.32.0",
+]
 
 [tool.uv.sources]
 llm = { git = "https://github.com/EricApgar/large-language-model" }
diff --git a/scripts/client_test.py b/scripts/client_test.py
index 099c13a..9e10d87 100644
--- a/scripts/client_test.py
+++ b/scripts/client_test.py
@@ -1,45 +1,42 @@
-import json
 import requests
-from io import BytesIO
-import base64
 
 from PIL import Image as PillowImage
 from llm_conversation import Conversation
 
-from llm_server.gui_app import run_gui
+from llm_server.helper.helper import encode_image
 
 
-def pil_to_api_b64(pil_image: PillowImage.Image) -> str:
-    buffer = BytesIO()
-    pil_image.save(buffer, format='PNG')
+# TODO: Point to endpoint of hosted llm_server.Server().
+URL = 'http://127.0.0.1:8001/ask'
 
-    return base64.b64encode(buffer.getvalue()).decode('ascii')
-
-
-URL = 'https://127.0.0.1:8000/ask'
-
-# images = [pil_to_api_b64(PillowImage.open(''))]
+# TODO: Edit as needed.
+images = [encode_image(image=PillowImage.open(r'/home/eric/Desktop/monkey.png'))]
 
+# TODO: Edit as needed.
 c = Conversation()
 c.set_overall_prompt(text='Pretend to be a person named John Doe.')
 c.add_context(text='Your favorite color is onyx.')
 c.add_response(role='user', text='Whats your name and favorite color?')
 prompt = c.to_dict()
 
-REQUEST_DETAILS = {
+# TODO: Edit as needed.
+REQUEST_DETAILS_TEXT = {
     'tag': 'GPT',
-    'prompt': prompt,  #'Name a primary color.',
-    # 'images': images,
-    'max_tokens': 64,
-    'temperature': 0.9,}
+    'prompt': prompt,
+    'temperature': 0.9}
+
+REQUEST_DETAILS_IMAGE = {
+    'tag': 'Phi-4',
+    'prompt': 'Describe the image.',
+    'images': images}
 
 
 def main() -> None:
     try:
-        response = requests.post(URL, json=REQUEST_DETAILS, timeout=15)
+        response = requests.post(URL, json=REQUEST_DETAILS_TEXT, timeout=15)
         data = response.json()
-        print(json.dumps(data, indent=4))
+        print(data['text'])
 
     except Exception as e:
         raise(e)
@@ -49,6 +46,6 @@ def main() -> None:
 
 
 if __name__ in {'__main__', '__mp_main__'}:
-    run_gui()
-
-    # main()
+    # TODO: Make sure a server instance (llm_server.Server() or
+    # llm_server.run_gui()) is running before running main().
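+    # main() posts REQUEST_DETAILS_TEXT; swap in REQUEST_DETAILS_IMAGE there to
+    # exercise the multimodal image path instead.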
+    main()
diff --git a/src/llm_server/__init__.py b/src/llm_server/__init__.py
index 85fb36b..ca2afd0 100644
--- a/src/llm_server/__init__.py
+++ b/src/llm_server/__init__.py
@@ -1,16 +1,18 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
 
 
 __all__ = [
     'Server',
-    'server_gui',
+    'run_gui',
+    'encode_image',
 ]
 
 
 if TYPE_CHECKING:
     from .server import Server
-    from .gui_app import run_gui as server_gui
+    from .gui_app import run_gui as run_gui
+    from .helper.helper import encode_image
 
 
 def __getattr__(name: str):
@@ -18,10 +20,14 @@ def __getattr__(name: str):
         from .server import Server
         globals()[name] = Server
         return Server
-    elif name == 'server_gui':
-        from .gui_app import run_gui as server_gui
-        globals()[name] = server_gui
-        return server_gui
+    elif name == 'run_gui':
+        from .gui_app import run_gui
+        globals()[name] = run_gui
+        return run_gui
+    elif name == 'encode_image':
+        from .helper.helper import encode_image
+        globals()[name] = encode_image
+        return encode_image
     else:
         raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
diff --git a/src/llm_server/backend.py b/src/llm_server/backend.py
index 6cf002f..82e514e 100644
--- a/src/llm_server/backend.py
+++ b/src/llm_server/backend.py
@@ -110,6 +110,9 @@ def ask(self, details: Request) -> str:
         else:
             del input_args['images']
 
+        # Remove unset (None) fields so defaults are used downstream.
+        input_args = {k: v for k, v in input_args.items() if v is not None}
+
         if isinstance(input_args['prompt'], dict):
             c = Conversation()
             c.from_dict(data=input_args['prompt'])
diff --git a/src/llm_server/gui_app.py b/src/llm_server/gui_app.py
index 53ef34c..9e7308c 100644
--- a/src/llm_server/gui_app.py
+++ b/src/llm_server/gui_app.py
@@ -1,7 +1,6 @@
 import queue
 import weakref
 import re
-from dataclasses import dataclass
 import socket
 
 from nicegui import ui, run
diff --git a/src/llm_server/helper/helper.py b/src/llm_server/helper/helper.py
index 7b2b53c..026645a 100644
--- a/src/llm_server/helper/helper.py
+++ b/src/llm_server/helper/helper.py
@@ -1,5 +1,21 @@
+from io import BytesIO
+import base64
 from typing import Optional, overload
 
+from PIL import Image as PillowImage
+
+
+def encode_image(image: PillowImage.Image) -> str:
+    """
+    Encode a PIL image as a base64 string for API transport.
+    """
+
+    buffer = BytesIO()
+    image.save(buffer, format='PNG')
+    encoded_output = base64.b64encode(buffer.getvalue()).decode('ascii')
+
+    return encoded_output
+
 
 class Endpoint:
diff --git a/src/llm_server/schemas.py b/src/llm_server/schemas.py
index 3b12b28..3b5405d 100644
--- a/src/llm_server/schemas.py
+++ b/src/llm_server/schemas.py
@@ -6,7 +6,7 @@ class Request(BaseModel):
     prompt: str | dict = Field(..., description='LLM prompt or conversation (llm-conversation.to_dict).')
     images: list[str] | None = Field(default=None, description='Each image should be base64-encoded image bytes.', examples=[None])
     max_tokens: int = Field(default=256, description='Max token count for generation.')
-    temperature: float = Field(default=0.9, description='Temperature of generated response.')
+    temperature: float | None = Field(default=None, description='Temperature of generated response.')
 
 
 class Response(BaseModel):
diff --git a/tests/test_server.py b/tests/test_server.py
new file mode 100644
index 0000000..4db6d9c
--- /dev/null
+++ b/tests/test_server.py
@@ -0,0 +1,65 @@
+"""
+Tests for llm_server.Server.
+
+These tests can load a real model when hosting the server, so a full test run
+requires local model weights. Sync the environment first, then pass
+LLM_MODEL_CACHE inline when invoking pytest so it only exists for that one
+command and does not persist in your shell:
+
+    uv sync --extra dev
+    LLM_MODEL_CACHE=<path-to-model-weights> pytest
+
+Tests that need a model are skipped automatically if LLM_MODEL_CACHE is not set.
+
+Make sure the virtual environment is active.
+"""
+
+import os
+import time
+
+import pytest
+import requests
+
+import llm_server
+
+
+MODEL_CACHE = os.environ.get('LLM_MODEL_CACHE')
+
+
+@pytest.fixture(scope='module')
+def server():
+    s = llm_server.Server()
+    s.set_host(ip_address='127.0.0.1', port=8001)
+    s.start()
+    time.sleep(1)  # Allow uvicorn to finish starting.
+    yield s
+    s.stop()
+
+
+@pytest.fixture(scope='module')
+def server_with_model():
+    if not MODEL_CACHE:
+        pytest.skip('LLM_MODEL_CACHE environment variable not set.')
+    s = llm_server.Server()
+    s.set_host(ip_address='127.0.0.1', port=8002)
+    s.add_model(tag='gpt', name='openai/gpt-oss-20b')
+    s.load_model(tag='gpt', location=MODEL_CACHE)
+    s.start()
+    time.sleep(1)  # Allow uvicorn to finish starting.
+    yield s
+    s.stop()
+
+
+def test_server_running(server):
+    response = requests.get('http://127.0.0.1:8001/', timeout=5)
+    assert response.json() == 'Running.'
+
+
+def test_ask(server_with_model):
+    response = requests.post(
+        'http://127.0.0.1:8002/ask',
+        json={'tag': 'gpt', 'prompt': 'Name a primary color.'},
+        timeout=60)
+    data = response.json()
+    assert isinstance(data['text'], str)
+    assert len(data['text']) > 0
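+
+
+# A hypothetical extra smoke test for the /get-models endpoint. Only the status
+# code is asserted, since the exact response payload shape is not pinned down here.
+def test_get_models(server):
+    response = requests.get('http://127.0.0.1:8001/get-models', timeout=5)
+    assert response.status_code == 200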
diff --git a/uv.lock b/uv.lock
index 2b62547..45941f9 100644
--- a/uv.lock
+++ b/uv.lock
@@ -445,11 +445,10 @@ wheels = [
 
 [[package]]
 name = "llm"
-version = "0.4.2"
-source = { git = "https://github.com/EricApgar/large-language-model#275714b6a3c52eb7ce436e7707a30b29de5ff162" }
+version = "0.4.3"
+source = { git = "https://github.com/EricApgar/large-language-model#4f85188e9b30b91caf7bdc64e53bbf4e905e8e43" }
 dependencies = [
     { name = "llm-conversation" },
-    { name = "pytest" },
     { name = "sentence-transformers" },
     { name = "torch" },
     { name = "torchvision" },
@@ -473,7 +472,7 @@ source = { git = "https://github.com/EricApgar/llm-conversation#ce9d59af41f4a018" }
 
 [[package]]
 name = "llm-server"
-version = "0.1.0"
+version = "0.2.0"
 source = { editable = "." }
 dependencies = [
     { name = "fastapi" },
@@ -482,6 +481,10 @@ dependencies = [
 ]
 
 [package.optional-dependencies]
+dev = [
+    { name = "pytest" },
+    { name = "requests" },
+]
 gui = [
     { name = "nicegui" },
 ]
@@ -491,9 +494,11 @@ requires-dist = [
     { name = "fastapi", specifier = ">=0.135.1" },
     { name = "llm", extras = ["all"], git = "https://github.com/EricApgar/large-language-model" },
     { name = "nicegui", marker = "extra == 'gui'", specifier = ">=3.8.0" },
+    { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
+    { name = "requests", marker = "extra == 'dev'", specifier = ">=2.32.0" },
     { name = "uvicorn", specifier = ">=0.41.0" },
 ]
-provides-extras = ["gui"]
+provides-extras = ["gui", "dev"]
 
 [[package]]
 name = "markdown2"
@@ -1295,9 +1300,9 @@ dependencies = [
     { name = "torch" },
 ]
 wheels = [
-    { url = "https://download.pytorch.org/whl/cu130/torchvision-0.25.0%2Bcu130-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3646e6c8fa5066da392d0ff13002cc683301386fe1933f8f1432fc5292e5d288" },
-    { url = "https://download.pytorch.org/whl/cu130/torchvision-0.25.0%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c2f5e38f0cd57a2796e4503c0f13365deba01dbc167ef820f0beec7ca96f5f2e" },
-    { url = "https://download.pytorch.org/whl/cu130/torchvision-0.25.0%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:7dd245ee7df0ceb00125e57615de31ca7232bf046143c2c3fe7a3b321bb50958" },
+    { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.25.0%2Bcu130-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3646e6c8fa5066da392d0ff13002cc683301386fe1933f8f1432fc5292e5d288" },
+    { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.25.0%2Bcu130-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c2f5e38f0cd57a2796e4503c0f13365deba01dbc167ef820f0beec7ca96f5f2e" },
+    { url = "https://download-r2.pytorch.org/whl/cu130/torchvision-0.25.0%2Bcu130-cp312-cp312-win_amd64.whl", hash = "sha256:7dd245ee7df0ceb00125e57615de31ca7232bf046143c2c3fe7a3b321bb50958" },
 ]
 
 [[package]]