From 5cc50bd88697290d99edc075d856bbba97d43cfc Mon Sep 17 00:00:00 2001 From: AlpinDale Date: Fri, 31 Oct 2025 20:49:26 +0430 Subject: [PATCH 1/3] [refactor] chore: migrate from `typing.{List,Union,Optional,Dict,Set}` to builtin syntax --- backends/base_model_container.py | 34 +++--- backends/exllamav2/grammar.py | 7 +- backends/exllamav2/model.py | 87 +++++++------- backends/exllamav3/model.py | 65 +++++------ backends/exllamav3/sampler.py | 3 +- backends/infinity/model.py | 7 +- colab/TabbyAPI_Colab_Example.ipynb | 6 +- common/args.py | 3 +- common/auth.py | 5 +- common/config_models.py | 132 +++++++++++----------- common/downloader.py | 23 ++-- common/gen_logging.py | 5 +- common/health.py | 3 +- common/model.py | 7 +- common/multimodal.py | 3 +- common/networking.py | 3 +- common/sampling.py | 93 ++++++++------- common/tabby_config.py | 5 +- common/templating.py | 9 +- common/transformers_utils.py | 17 ++- common/utils.py | 18 +-- config_sample.yml | 14 +-- docker/Dockerfile | 4 +- docs/01.-Getting-Started.md | 2 +- docs/02.-Server-options.md | 20 ++-- docs/06.-Sharing.md | 2 +- endpoints/Kobold/types/generation.py | 9 +- endpoints/Kobold/types/token.py | 3 +- endpoints/OAI/router.py | 4 +- endpoints/OAI/types/chat_completion.py | 56 ++++----- endpoints/OAI/types/common.py | 31 +++-- endpoints/OAI/types/completion.py | 19 ++-- endpoints/OAI/types/embedding.py | 16 ++- endpoints/OAI/types/tools.py | 4 +- endpoints/OAI/utils/chat_completion.py | 25 ++-- endpoints/OAI/utils/completion.py | 11 +- endpoints/OAI/utils/tools.py | 13 +-- endpoints/core/router.py | 7 +- endpoints/core/types/download.py | 15 ++- endpoints/core/types/health.py | 2 +- endpoints/core/types/lora.py | 13 +-- endpoints/core/types/model.py | 76 +++++++------ endpoints/core/types/sampler_overrides.py | 7 +- endpoints/core/types/template.py | 3 +- endpoints/core/types/token.py | 7 +- endpoints/core/utils/model.py | 5 +- endpoints/server.py | 3 +- main.py | 15 ++- start.py | 7 +- 49 files changed, 446 insertions(+), 482 deletions(-) diff --git a/backends/base_model_container.py b/backends/base_model_container.py index 96393aba..cc0c5c1e 100644 --- a/backends/base_model_container.py +++ b/backends/base_model_container.py @@ -2,13 +2,7 @@ import asyncio import pathlib from loguru import logger -from typing import ( - Any, - AsyncIterator, - Dict, - List, - Optional, -) +from typing import Any, AsyncIterator from common.multimodal import MultimodalEmbeddingWrapper from common.sampling import BaseSamplerRequest from common.templating import PromptTemplate @@ -21,7 +15,7 @@ class BaseModelContainer(abc.ABC): # Exposed model information model_dir: pathlib.Path = pathlib.Path("models") - prompt_template: Optional[PromptTemplate] = None + prompt_template: PromptTemplate | None = None # HF Model instance hf_model: HFModel @@ -34,7 +28,7 @@ class BaseModelContainer(abc.ABC): # The bool is a master switch for accepting requests # The lock keeps load tasks sequential # The condition notifies any waiting tasks - active_job_ids: Dict[str, Any] = {} + active_job_ids: dict[str, Any] = {} loaded: bool = False load_lock: asyncio.Lock load_condition: asyncio.Condition @@ -98,7 +92,7 @@ async def unload(self, loras_only: bool = False, **kwargs): pass @abc.abstractmethod - def encode_tokens(self, text: str, **kwargs) -> List[int]: + def encode_tokens(self, text: str, **kwargs) -> list[int]: """ Encodes a string of text into a list of token IDs. 
@@ -113,7 +107,7 @@ def encode_tokens(self, text: str, **kwargs) -> List[int]: pass @abc.abstractmethod - def decode_tokens(self, ids: List[int], **kwargs) -> str: + def decode_tokens(self, ids: list[int], **kwargs) -> str: """ Decodes a list of token IDs back into a string. @@ -128,7 +122,7 @@ def decode_tokens(self, ids: List[int], **kwargs) -> str: pass @abc.abstractmethod - def get_special_tokens(self) -> Dict[str, Any]: + def get_special_tokens(self) -> dict[str, Any]: """ Gets special tokens used by the model/tokenizer. @@ -164,7 +158,7 @@ async def wait_for_jobs(self, skip_wait: bool = False): # Optional methods async def load_loras( self, lora_directory: pathlib.Path, **kwargs - ) -> Dict[str, List[str]]: + ) -> dict[str, list[str]]: """ Loads LoRA adapters. Base implementation does nothing or raises error. @@ -184,7 +178,7 @@ async def load_loras( ], } - def get_loras(self) -> List[Any]: + def get_loras(self) -> list[Any]: """ Gets the currently loaded LoRA adapters. Base implementation returns empty list. @@ -200,9 +194,9 @@ async def generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, - ) -> Dict[str, Any]: + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, + ) -> dict[str, Any]: """ Generates a complete response for a given prompt and parameters. @@ -225,9 +219,9 @@ async def stream_generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, - ) -> AsyncIterator[Dict[str, Any]]: + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, + ) -> AsyncIterator[dict[str, Any]]: """ Generates a response iteratively (streaming) for a given prompt. 
diff --git a/backends/exllamav2/grammar.py b/backends/exllamav2/grammar.py index f9bb464c..2605bec3 100644 --- a/backends/exllamav2/grammar.py +++ b/backends/exllamav2/grammar.py @@ -1,7 +1,6 @@ import traceback -import typing from functools import lru_cache -from typing import List +from typing import Any import torch from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer @@ -16,7 +15,7 @@ class ExLlamaV2Grammar: """ExLlamaV2 class for various grammar filters/parsers.""" - filters: List[ExLlamaV2Filter] + filters: list[ExLlamaV2Filter] def __init__(self): self.filters = [] @@ -123,7 +122,7 @@ def __init__(self, nonterminal: str, kbnf_string: str): self.kbnf_string = kbnf_string # Return the entire input string as the extracted string - def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]: + def extract(self, input_str: str) -> tuple[str, Any] | None: return "", input_str @property diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index ae71b00f..903b4929 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -25,7 +25,6 @@ ) from itertools import zip_longest from loguru import logger -from typing import Dict, List, Optional from backends.base_model_container import BaseModelContainer from backends.exllamav2.grammar import ( @@ -58,45 +57,45 @@ class ExllamaV2Container(BaseModelContainer): # Model directories model_dir: pathlib.Path = pathlib.Path("models") draft_model_dir: pathlib.Path = pathlib.Path("models") - prompt_template: Optional[PromptTemplate] = None + prompt_template: PromptTemplate | None = None # HF model instance hf_model: HFModel # Exl2 vars - config: Optional[ExLlamaV2Config] = None - model: Optional[ExLlamaV2] = None - cache: Optional[ExLlamaV2Cache] = None - tokenizer: Optional[ExLlamaV2Tokenizer] = None - generator: Optional[ExLlamaV2DynamicGeneratorAsync] = None - prompt_template: Optional[PromptTemplate] = None + config: ExLlamaV2Config | None = None + model: ExLlamaV2 | None = None + cache: ExLlamaV2Cache | None = None + tokenizer: ExLlamaV2Tokenizer | None = None + generator: ExLlamaV2DynamicGeneratorAsync | None = None + prompt_template: PromptTemplate | None = None paged: bool = True # Draft model vars use_draft_model: bool = False - draft_config: Optional[ExLlamaV2Config] = None - draft_model: Optional[ExLlamaV2] = None - draft_cache: Optional[ExLlamaV2Cache] = None + draft_config: ExLlamaV2Config | None = None + draft_model: ExLlamaV2 | None = None + draft_cache: ExLlamaV2Cache | None = None # Internal config vars cache_size: int = None cache_mode: str = "FP16" draft_cache_mode: str = "FP16" - max_batch_size: Optional[int] = None + max_batch_size: int | None = None # GPU split vars - gpu_split: List[float] = [] - draft_gpu_split: List[float] = [] + gpu_split: list[float] = [] + draft_gpu_split: list[float] = [] gpu_split_auto: bool = True - autosplit_reserve: List[float] = [96 * 1024**2] + autosplit_reserve: list[float] = [96 * 1024**2] use_tp: bool = False # Vision vars use_vision: bool = False - vision_model: Optional[ExLlamaV2VisionTower] = None + vision_model: ExLlamaV2VisionTower | None = None # Load synchronization - active_job_ids: Dict[str, Optional[ExLlamaV2DynamicJobAsync]] = {} + active_job_ids: dict[str, ExLlamaV2DynamicJobAsync | None] = {} loaded: bool = False load_lock: asyncio.Lock = asyncio.Lock() load_condition: asyncio.Condition = asyncio.Condition() @@ -130,7 +129,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Check if the model arch is 
compatible with various exl2 features self.config.arch_compat_overrides() - # Set vision state and error if vision isn't supported on the current model + # set vision state and error if vision isn't supported on the current model self.use_vision = unwrap(kwargs.get("vision"), False) if self.use_vision and not self.config.vision_model_type: raise ValueError( @@ -185,12 +184,12 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs gpu_split = unwrap(kwargs.get("gpu_split"), []) gpu_device_list = list(range(0, gpu_count)) - # Set GPU split options + # set GPU split options if gpu_count == 1: self.gpu_split_auto = False logger.info("Disabling GPU split because one GPU is in use.") else: - # Set tensor parallel + # set tensor parallel if use_tp: self.use_tp = True @@ -233,7 +232,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Hardcode max output length to 16 self.config.max_output_len = 16 - # Set max batch size to the config override + # set max batch size to the config override self.max_batch_size = unwrap(kwargs.get("max_batch_size")) # Check whether the user's configuration supports flash/paged attention @@ -262,7 +261,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Grab user-set max seq len user_max_seq_len = kwargs.get("max_seq_len") - # Set k/v cache size + # set k/v cache size # cache_size is only relevant when paged mode is enabled if self.paged: user_cache_size = coalesce(kwargs.get("cache_size"), user_max_seq_len, 4096) @@ -273,7 +272,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs user_max_seq_len, min(hf_model.hf_config.max_position_embeddings, 4096) ) - # Set the rope scale + # set the rope scale self.config.scale_pos_emb = unwrap( kwargs.get("rope_scale"), self.config.scale_pos_emb ) @@ -322,7 +321,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs self.config.max_input_len = chunk_size self.config.max_attention_size = chunk_size**2 - # Set user-configured draft model values + # set user-configured draft model values if self.use_draft_model: self.draft_config.max_seq_len = self.config.max_seq_len @@ -330,7 +329,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs draft_args.get("draft_rope_scale"), 1.0 ) - # Set draft rope alpha. Follows same behavior as model rope alpha. + # set draft rope alpha. Follows same behavior as model rope alpha. 
# Use the max_position_embeddings of the model draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto") if draft_rope_alpha == "auto": @@ -341,7 +340,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs else: self.draft_config.scale_alpha_value = draft_rope_alpha - # Set draft cache mode + # set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") # Catch exllamav3 draft_cache_mode @@ -750,9 +749,9 @@ async def load_loras(self, lora_directory: pathlib.Path, **kwargs): # Wait for existing generation jobs to finish await self.wait_for_jobs(kwargs.get("skip_wait")) - loras_to_load: List[ExLlamaV2Lora] = [] - success: List[str] = [] - failure: List[str] = [] + loras_to_load: list[ExLlamaV2Lora] = [] + success: list[str] = [] + failure: list[str] = [] for lora in loras: lora_name = lora.get("name") @@ -836,7 +835,7 @@ async def unload(self, loras_only: bool = False, **kwargs): await self.generator.close() self.generator = None - # Set all model state variables to False + # set all model state variables to False self.loaded = False gc.collect() @@ -869,7 +868,7 @@ def encode_tokens(self, text: str, **kwargs): .tolist() ) - def decode_tokens(self, ids: List[int], **kwargs): + def decode_tokens(self, ids: list[int], **kwargs): """Wrapper to decode tokens from a list of IDs""" ids = torch.tensor([ids]) @@ -908,8 +907,8 @@ async def generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): """Generate a response to a prompt.""" generations = [] @@ -969,8 +968,8 @@ async def stream_generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): try: # Wait for load lock to be freed before processing @@ -1136,15 +1135,15 @@ def assign_gen_params( "top_k, top_p, and typical to 1.0, 1, 0, and 0." ) - # Set banned tokens + # set banned tokens if params.banned_tokens: gen_settings.disallow_tokens(self.tokenizer, params.banned_tokens) - # Set allowed tokens + # set allowed tokens if params.allowed_tokens: gen_settings.allow_tokens(self.tokenizer, params.allowed_tokens) - # Set logit bias + # set logit bias if params.logit_bias: # Create a vocab tensor if it doesn't exist for token biasing if gen_settings.token_bias is None: @@ -1242,8 +1241,8 @@ async def generate_gen( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): """ Create generator function for prompt completion. 
@@ -1261,7 +1260,7 @@ async def generate_gen( grammar_handler, ) - # Set banned strings + # set banned strings banned_strings = params.banned_strings if banned_strings and len(grammar_handler.filters) > 0: logger.warning( @@ -1271,7 +1270,7 @@ async def generate_gen( banned_strings = [] - # Set CFG scale and negative prompt + # set CFG scale and negative prompt cfg_scale = params.cfg_scale negative_prompt = None if cfg_scale not in [None, 1.0]: @@ -1301,7 +1300,7 @@ async def generate_gen( stop_conditions = params.stop ban_eos_token = params.ban_eos_token - # Set add_bos_token for generation + # set add_bos_token for generation add_bos_token = unwrap(params.add_bos_token, self.hf_model.add_bos_token()) # Fetch EOS tokens from the HF model if they exist @@ -1309,7 +1308,7 @@ async def generate_gen( # Ban the EOS token if specified. If not, append to stop conditions # as well. - # Set this below logging to avoid polluting the stop strings array + # set this below logging to avoid polluting the stop strings array if ban_eos_token: gen_settings.disallow_tokens(self.tokenizer, eos_tokens) else: diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index d385de7c..4b369c7f 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -7,9 +7,6 @@ from typing import ( Any, AsyncIterator, - Dict, - List, - Optional, ) from exllamav3 import ( @@ -49,7 +46,7 @@ class ExllamaV3Container(BaseModelContainer): # Exposed model information model_dir: pathlib.Path = pathlib.Path("models") - prompt_template: Optional[PromptTemplate] = None + prompt_template: PromptTemplate | None = None # HF Model instance hf_model: HFModel @@ -58,26 +55,26 @@ class ExllamaV3Container(BaseModelContainer): # The bool is a master switch for accepting requests # The lock keeps load tasks sequential # The condition notifies any waiting tasks - active_job_ids: Dict[str, Any] = {} + active_job_ids: dict[str, Any] = {} loaded: bool = False load_lock: asyncio.Lock = asyncio.Lock() load_condition: asyncio.Condition = asyncio.Condition() # Exl3 vars - model: Optional[Model] = None - cache: Optional[Cache] = None - draft_model: Optional[Model] = None - draft_cache: Optional[Cache] = None - tokenizer: Optional[Tokenizer] = None - config: Optional[Config] = None - draft_config: Optional[Config] = None - generator: Optional[AsyncGenerator] = None - vision_model: Optional[Model] = None + model: Model | None = None + cache: Cache | None = None + draft_model: Model | None = None + draft_cache: Cache | None = None + tokenizer: Tokenizer | None = None + config: Config | None = None + draft_config: Config | None = None + generator: AsyncGenerator | None = None + vision_model: Model | None = None # Class-specific vars - gpu_split: Optional[List[float]] = None + gpu_split: list[float] | None = None gpu_split_auto: bool = True - autosplit_reserve: Optional[List[float]] = [96 / 1024] + autosplit_reserve: list[float] | None = [96 / 1024] use_tp: bool = False tp_backend: str = "native" max_seq_len: int = 4096 @@ -85,8 +82,8 @@ class ExllamaV3Container(BaseModelContainer): cache_mode: str = "FP16" draft_cache_mode: str = "FP16" chunk_size: int = 2048 - max_rq_tokens: Optional[int] = 2048 - max_batch_size: Optional[int] = None + max_rq_tokens: int | None = 2048 + max_batch_size: int | None = None # Required methods @classmethod @@ -160,12 +157,12 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs gpu_device_list = list(range(0, gpu_count)) use_tp = unwrap(kwargs.get("tensor_parallel"), 
False) - # Set GPU split options + # set GPU split options if gpu_count == 1: self.gpu_split_auto = False logger.info("Disabling GPU split because one GPU is in use.") else: - # Set tensor parallel + # set tensor parallel if use_tp: self.use_tp = True tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native") @@ -182,7 +179,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # TP has its own autosplit loader self.gpu_split_auto = False - # Set GPU split options + # set GPU split options # Enable manual GPU split if provided if gpu_split: self.gpu_split_auto = False @@ -238,7 +235,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Draft cache if self.use_draft_model: - # Set draft cache mode + # set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") self.draft_cache = self.create_cache( self.draft_cache_mode, self.draft_model @@ -579,7 +576,7 @@ async def unload(self, loras_only: bool = False, **kwargs): async with self.load_condition: self.load_condition.notify_all() - def encode_tokens(self, text: str, **kwargs) -> List[int]: + def encode_tokens(self, text: str, **kwargs) -> list[int]: """ Encodes a string of text into a list of token IDs. @@ -607,7 +604,7 @@ def encode_tokens(self, text: str, **kwargs) -> List[int]: .tolist() ) - def decode_tokens(self, ids: List[int], **kwargs) -> str: + def decode_tokens(self, ids: list[int], **kwargs) -> str: """ Decodes a list of token IDs back into a string. @@ -666,9 +663,9 @@ async def generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, - ) -> Dict[str, Any]: + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, + ) -> dict[str, Any]: """ Generates a complete response for a given prompt and parameters. @@ -738,9 +735,9 @@ async def stream_generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, - ) -> AsyncIterator[Dict[str, Any]]: + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, + ) -> AsyncIterator[dict[str, Any]]: """ Generates a response iteratively (streaming) for a given prompt. @@ -859,8 +856,8 @@ async def generate_gen( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): """ Create generator function for prompt completion. 
@@ -873,7 +870,7 @@ async def generate_gen( # Penalties - # Set penalty range + # set penalty range penalty_range = unwrap(params.penalty_range, self.max_seq_len) # Exl3's version of including the entire context @@ -914,7 +911,7 @@ async def generate_gen( sampler_builder.temperature(params.temperature) # Build the sampler - # Set greedy if temperature is 0 + # set greedy if temperature is 0 sampler = sampler_builder.build(params.temperature == 0) # Dynamically scale penalty range to output tokens diff --git a/backends/exllamav3/sampler.py b/backends/exllamav3/sampler.py index 7b08a9b1..eef7d944 100644 --- a/backends/exllamav3/sampler.py +++ b/backends/exllamav3/sampler.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from exllamav3.generator.sampler import ( CustomSampler, SS_Temperature, @@ -20,7 +19,7 @@ class ExllamaV3SamplerBuilder: Custom sampler chain/stack for TabbyAPI """ - stack: List[SS_Base] = field(default_factory=list) + stack: list[SS_Base] = field(default_factory=list) def penalties(self, rep_p, freq_p, pres_p, penalty_range, rep_decay): self.stack += [ diff --git a/backends/infinity/model.py b/backends/infinity/model.py index c131e3cb..e1dd9b1b 100644 --- a/backends/infinity/model.py +++ b/backends/infinity/model.py @@ -1,8 +1,9 @@ +from __future__ import annotations + import gc import pathlib import torch from loguru import logger -from typing import List, Optional from common.utils import unwrap from common.optional_dependencies import dependencies @@ -17,7 +18,7 @@ class InfinityContainer: loaded: bool = False # Use a runtime type hint here - engine: Optional["AsyncEmbeddingEngine"] = None + engine: AsyncEmbeddingEngine | None = None def __init__(self, model_directory: pathlib.Path): self.model_dir = model_directory @@ -49,7 +50,7 @@ async def unload(self): logger.info("Embedding model unloaded.") - async def generate(self, sentence_input: List[str]): + async def generate(self, sentence_input: list[str]): result_embeddings, usage = await self.engine.embed(sentence_input) return {"embeddings": result_embeddings, "usage": usage} diff --git a/colab/TabbyAPI_Colab_Example.ipynb b/colab/TabbyAPI_Colab_Example.ipynb index b8e32f0a..c23bd345 100644 --- a/colab/TabbyAPI_Colab_Example.ipynb +++ b/colab/TabbyAPI_Colab_Example.ipynb @@ -184,13 +184,13 @@ " # Only use if your model was trained on long context with rope (check config.json)\n", " rope_alpha: {RopeAlpha}\n", "\n", - " # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n", + " # Disable Flash-attention 2. set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n", " no_flash_attention: {NoFlashAttention}\n", "\n", " # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)\n", " cache_mode: {CacheMode}\n", "\n", - " # Set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n", + " # set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n", " # NOTE: Only works with chat completion message lists!\n", " prompt_template: {PromptTemplate}\n", "\n", @@ -218,7 +218,7 @@ " # Overrides the directory to look for loras (default: loras)\n", " lora_dir: loras\n", "\n", - " # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.\n", + " # list of loras to load and associated scaling factors (default: 1.0). 
Comment out unused entries or add more rows as needed.\n", " loras:\n", " - name: {lora}\n", " scaling: {LoraScaling}\n", diff --git a/common/args.py b/common/args.py index a0da4f98..cd286877 100644 --- a/common/args.py +++ b/common/args.py @@ -1,7 +1,6 @@ """Argparser for overriding config values""" import argparse -from typing import Optional from pydantic import BaseModel from common.config_models import TabbyConfigModel @@ -25,7 +24,7 @@ def add_field_to_group(group, field_name, field_type, field) -> None: def init_argparser( - existing_parser: Optional[argparse.ArgumentParser] = None, + existing_parser: argparse.ArgumentParser | None = None, ) -> argparse.ArgumentParser: """ Initializes an argparse parser based on a Pydantic config schema. diff --git a/common/auth.py b/common/auth.py index b02cdd02..c986fc40 100644 --- a/common/auth.py +++ b/common/auth.py @@ -10,7 +10,6 @@ from fastapi import Header, HTTPException, Request from pydantic import BaseModel from loguru import logger -from typing import Optional from common.utils import coalesce @@ -38,7 +37,7 @@ def verify_key(self, test_key: str, key_type: str): # Global auth constants -AUTH_KEYS: Optional[AuthKeys] = None +AUTH_KEYS: AuthKeys | None = None DISABLE_AUTH: bool = False @@ -52,7 +51,7 @@ async def load_auth_keys(disable_from_config: bool): if disable_from_config: logger.warning( "Disabling authentication makes your instance vulnerable. " - "Set the `disable_auth` flag to False in config.yml if you " + "set the `disable_auth` flag to False in config.yml if you " "want to share this instance with others." ) diff --git a/common/config_models.py b/common/config_models.py index 0e71734c..781acdb3 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -6,17 +6,17 @@ PrivateAttr, field_validator, ) -from typing import List, Literal, Optional, Union +from typing import Literal CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"] -CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")] +CACHE_TYPE = CACHE_SIZES | constr(pattern=r"^[2-8]\s*,\s*[2-8]$") class Metadata(BaseModel): """metadata model for config options""" - include_in_config: Optional[bool] = Field(True) + include_in_config: bool | None = Field(True) class BaseConfigModel(BaseModel): @@ -29,7 +29,7 @@ class ConfigOverrideConfig(BaseConfigModel): """Model for overriding a provided config file.""" # TODO: convert this to a pathlib.path? - config: Optional[str] = Field( + config: str | None = Field( None, description=("Path to an overriding config.yml file") ) @@ -39,17 +39,17 @@ class ConfigOverrideConfig(BaseConfigModel): class NetworkConfig(BaseConfigModel): """Options for networking""" - host: Optional[str] = Field( + host: str | None = Field( "127.0.0.1", description=( "The IP to host on (default: 127.0.0.1).\n" "Use 0.0.0.0 to expose on all network adapters." ), ) - port: Optional[int] = Field( + port: int | None = Field( 5000, description=("The port to host on (default: 5000).") ) - disable_auth: Optional[bool] = Field( + disable_auth: bool | None = Field( False, description=( "Disable HTTP token authentication with requests.\n" @@ -57,21 +57,21 @@ class NetworkConfig(BaseConfigModel): "Turn on this option if you are ONLY connecting from localhost." ), ) - disable_fetch_requests: Optional[bool] = Field( + disable_fetch_requests: bool | None = Field( False, description=( "Disable fetching external content in response to requests," "such as images from URLs." 
), ) - send_tracebacks: Optional[bool] = Field( + send_tracebacks: bool | None = Field( False, description=( "Send tracebacks over the API (default: False).\n" "NOTE: Only enable this for debug purposes." ), ) - api_servers: Optional[List[Literal["oai", "kobold"]]] = Field( + api_servers: list[Literal["oai", "kobold"]] | None = Field( ["OAI"], description=( 'Select API servers to enable (default: ["OAI"]).\n' @@ -91,15 +91,15 @@ def api_server_validator(cls, api_servers): class LoggingConfig(BaseConfigModel): """Options for logging""" - log_prompt: Optional[bool] = Field( + log_prompt: bool | None = Field( False, description=("Enable prompt logging (default: False)."), ) - log_generation_params: Optional[bool] = Field( + log_generation_params: bool | None = Field( False, description=("Enable generation parameter logging (default: False)."), ) - log_requests: Optional[bool] = Field( + log_requests: bool | None = Field( False, description=( "Enable request logging (default: False).\n" @@ -123,7 +123,7 @@ class ModelConfig(BaseConfigModel): "Windows users, do NOT put this path in quotes!" ), ) - inline_model_loading: Optional[bool] = Field( + inline_model_loading: bool | None = Field( False, description=( "Allow direct loading of models " @@ -132,7 +132,7 @@ class ModelConfig(BaseConfigModel): "Enable dummy models to add exceptions for invalid model names." ), ) - use_dummy_models: Optional[bool] = Field( + use_dummy_models: bool | None = Field( False, description=( "Sends dummy model names when the models endpoint is queried. " @@ -140,7 +140,7 @@ class ModelConfig(BaseConfigModel): "Enable this if the client is looking for specific OAI models.\n" ), ) - dummy_model_names: List[str] = Field( + dummy_model_names: list[str] = Field( default=["gpt-3.5-turbo"], description=( "A list of fake model names that are sent via the /v1/models endpoint. " @@ -148,7 +148,7 @@ class ModelConfig(BaseConfigModel): "Also used as bypasses for strict mode if inline_model_loading is true." ), ) - model_name: Optional[str] = Field( + model_name: str | None = Field( None, description=( "An initial model to load.\n" @@ -156,7 +156,7 @@ class ModelConfig(BaseConfigModel): "REQUIRED: This must be filled out to load a model on startup." ), ) - use_as_default: List[str] = Field( + use_as_default: list[str] = Field( default_factory=list, description=( "Names of args to use as a fallback for API load requests (default: []).\n" @@ -165,22 +165,22 @@ class ModelConfig(BaseConfigModel): "Example: ['max_seq_len', 'cache_mode']." ), ) - backend: Optional[str] = Field( + backend: str | None = Field( None, description=( "Backend to use for this model (auto-detect if not specified)\n" "Options: exllamav2, exllamav3" ), ) - max_seq_len: Optional[int] = Field( + max_seq_len: int | None = Field( None, description=( "Max sequence length (default: 4096).\n" - "Set to -1 to fetch from the model's config.json" + "set to -1 to fetch from the model's config.json" ), ge=-1, ) - cache_size: Optional[int] = Field( + cache_size: int | None = Field( None, description=( "Size of the prompt cache to allocate (default: max_seq_len).\n" @@ -190,7 +190,7 @@ class ModelConfig(BaseConfigModel): multiple_of=256, gt=0, ) - cache_mode: Optional[CACHE_TYPE] = Field( + cache_mode: CACHE_TYPE | None = Field( "FP16", description=( "Enable different cache modes for VRAM savings (default: FP16).\n" @@ -199,7 +199,7 @@ class ModelConfig(BaseConfigModel): "are integers from 2-8 (i.e. 8,8)." 
         ),
     )
-    tensor_parallel: Optional[bool] = Field(
+    tensor_parallel: bool | None = Field(
         False,
         description=(
             "Load model with tensor parallelism (default: False).\n"
@@ -207,7 +207,7 @@ class ModelConfig(BaseConfigModel):
             "This ignores the gpu_split_auto value."
         ),
     )
-    tensor_parallel_backend: Optional[str] = Field(
+    tensor_parallel_backend: str | None = Field(
         "native",
         description=(
             "Sets a backend type for tensor parallelism. (default: native).\n"
@@ -216,28 +216,28 @@ class ModelConfig(BaseConfigModel):
             "NCCL is recommended for NVLink."
         ),
     )
-    gpu_split_auto: Optional[bool] = Field(
+    gpu_split_auto: bool | None = Field(
         True,
         description=(
             "Automatically allocate resources to GPUs (default: True).\n"
             "Not parsed for single GPU users."
         ),
     )
-    autosplit_reserve: List[float] = Field(
+    autosplit_reserve: list[float] = Field(
         [96],
         description=(
             "Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"
             "Represented as an array of MB per GPU."
         ),
     )
-    gpu_split: List[float] = Field(
+    gpu_split: list[float] = Field(
         default_factory=list,
         description=(
             "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
             "Used with tensor parallelism."
         ),
     )
-    rope_scale: Optional[float] = Field(
+    rope_scale: float | None = Field(
         1.0,
         description=(
             "Rope scale (default: 1.0).\n"
@@ -246,16 +246,16 @@ class ModelConfig(BaseConfigModel):
             "Leave blank to pull the value from the model."
         ),
     )
-    rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
+    rope_alpha: float | Literal["auto"] | None = Field(
         None,
         description=(
             "Rope alpha (default: None).\n"
             'Same as alpha_value. Set to "auto" to auto-calculate.\n'
             "Leaving this value blank will either pull from the model "
             "or auto-calculate."
         ),
     )
-    chunk_size: Optional[int] = Field(
+    chunk_size: int | None = Field(
         2048,
         description=(
             "Chunk size for prompt ingestion (default: 2048).\n"
@@ -265,7 +265,7 @@ class ModelConfig(BaseConfigModel):
         ),
         gt=0,
     )
-    output_chunking: Optional[bool] = Field(
+    output_chunking: bool | None = Field(
         True,
         description=(
             "Use output chunking (default: True)\n"
@@ -274,27 +274,27 @@ class ModelConfig(BaseConfigModel):
             "Used by EXL3 models only.\n"
         ),
     )
-    max_batch_size: Optional[int] = Field(
+    max_batch_size: int | None = Field(
         None,
         description=(
             "Set the maximum number of prompts to process at one time "
             "(default: None/Automatic).\n"
             "Automatically calculated if left blank.\n"
             "NOTE: Only available for Nvidia ampere (30 series) and above GPUs."
         ),
         ge=1,
     )
-    prompt_template: Optional[str] = Field(
+    prompt_template: str | None = Field(
         None,
         description=(
             "Set the prompt template for this model. (default: None)\n"
             "If empty, attempts to look for the model's chat template.\n"
             "If a model contains multiple templates in its tokenizer_config.json,\n"
             "set prompt_template to the name of the template you want to use.\n"
             "NOTE: Only works with chat completion message lists!"
         ),
     )
-    vision: Optional[bool] = Field(
+    vision: bool | None = Field(
         False,
         description=(
             "Enables vision support if the model supports it. (default: False)"
@@ -312,18 +312,18 @@ class DraftModelConfig(BaseConfigModel):
     """
     # TODO: convert this to a pathlib.path?
-    draft_model_dir: Optional[str] = Field(
+    draft_model_dir: str | None = Field(
         "models",
         description=("Directory to look for draft models (default: models)"),
     )
-    draft_model_name: Optional[str] = Field(
+    draft_model_name: str | None = Field(
         None,
         description=(
             "An initial draft model to load.\n"
             "Ensure the model is in the model directory."
         ),
     )
-    draft_rope_scale: Optional[float] = Field(
+    draft_rope_scale: float | None = Field(
         1.0,
         description=(
             "Rope scale for draft models (default: 1.0).\n"
@@ -331,23 +331,23 @@ class DraftModelConfig(BaseConfigModel):
             "Use if the draft model was trained on long context with rope."
         ),
     )
-    draft_rope_alpha: Optional[float] = Field(
+    draft_rope_alpha: float | None = Field(
         None,
         description=(
             "Rope alpha for draft models (default: None).\n"
             'Same as alpha_value. Set to "auto" to auto-calculate.\n'
             "Leaving this value blank will either pull from the model "
             "or auto-calculate."
         ),
     )
-    draft_cache_mode: Optional[CACHE_SIZES] = Field(
+    draft_cache_mode: CACHE_SIZES | None = Field(
         "FP16",
         description=(
             "Cache mode for draft models to save VRAM (default: FP16).\n"
             f"Possible values: {str(CACHE_SIZES)[15:-1]}."
         ),
     )
-    draft_gpu_split: List[float] = Field(
+    draft_gpu_split: list[float] = Field(
         default_factory=list,
         description=(
             "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
@@ -359,7 +359,7 @@ class SamplingConfig(BaseConfigModel):
 class SamplingConfig(BaseConfigModel):
     """Options for Sampling"""
-    override_preset: Optional[str] = Field(
+    override_preset: str | None = Field(
         None,
         description=(
             "Select a sampler override preset (default: None).\n"
@@ -376,7 +376,7 @@ class LoraInstanceModel(BaseConfigModel):
 class LoraInstanceModel(BaseConfigModel):
     """Model representing an instance of a Lora."""
-    name: Optional[str] = None
+    name: str | None = None
     scaling: float = Field(1.0, ge=0)
@@ -384,13 +384,13 @@ class LoraConfig(BaseConfigModel):
     """Options for Loras"""
     # TODO: convert this to a pathlib.path?
-    lora_dir: Optional[str] = Field(
+    lora_dir: str | None = Field(
         "loras", description=("Directory to look for LoRAs (default: loras).")
     )
-    loras: Optional[List[LoraInstanceModel]] = Field(
+    loras: list[LoraInstanceModel] | None = Field(
         None,
         description=(
             "List of LoRAs to load and associated scaling factors "
             "(default scale: 1.0).\n"
             "For the YAML file, add each entry as a YAML list:\n"
             "- name: lora1\n"
             "scaling: 1.0"
         ),
     )
@@ -407,11 +407,11 @@ class EmbeddingsConfig(BaseConfigModel):
     """
     # TODO: convert this to a pathlib.path?
-    embedding_model_dir: Optional[str] = Field(
+    embedding_model_dir: str | None = Field(
         "models",
         description=("Directory to look for embedding models (default: models)."),
     )
-    embeddings_device: Optional[Literal["cpu", "auto", "cuda"]] = Field(
+    embeddings_device: Literal["cpu", "auto", "cuda"] | None = Field(
         "cpu",
         description=(
             "Device to load embedding models on (default: cpu).\n"
             "Possible values: cpu, auto, cuda.\n"
             "If using an AMD GPU, set this value to 'cuda'."
), ) - embedding_model_name: Optional[str] = Field( + embedding_model_name: str | None = Field( None, description=("An initial embedding model to load on the infinity backend."), ) @@ -429,7 +429,7 @@ class EmbeddingsConfig(BaseConfigModel): class DeveloperConfig(BaseConfigModel): """Options for development and experimentation""" - unsafe_launch: Optional[bool] = Field( + unsafe_launch: bool | None = Field( False, description=( "Skip Exllamav2 version check (default: False).\n" @@ -437,13 +437,13 @@ class DeveloperConfig(BaseConfigModel): "than enabling this flag." ), ) - disable_request_streaming: Optional[bool] = Field( + disable_request_streaming: bool | None = Field( False, description=("Disable API request streaming (default: False).") ) - realtime_process_priority: Optional[bool] = Field( + realtime_process_priority: bool | None = Field( False, description=( - "Set process to use a higher priority.\n" + "set process to use a higher priority.\n" "For realtime process priority, run as administrator or sudo.\n" "Otherwise, the priority will be set to high." ), @@ -453,27 +453,27 @@ class DeveloperConfig(BaseConfigModel): class TabbyConfigModel(BaseModel): """Base model for a TabbyConfig.""" - config: Optional[ConfigOverrideConfig] = Field( + config: ConfigOverrideConfig | None = Field( default_factory=ConfigOverrideConfig.model_construct ) - network: Optional[NetworkConfig] = Field( + network: NetworkConfig | None = Field( default_factory=NetworkConfig.model_construct ) - logging: Optional[LoggingConfig] = Field( + logging: LoggingConfig | None = Field( default_factory=LoggingConfig.model_construct ) - model: Optional[ModelConfig] = Field(default_factory=ModelConfig.model_construct) - draft_model: Optional[DraftModelConfig] = Field( + model: ModelConfig | None = Field(default_factory=ModelConfig.model_construct) + draft_model: DraftModelConfig | None = Field( default_factory=DraftModelConfig.model_construct ) - lora: Optional[LoraConfig] = Field(default_factory=LoraConfig.model_construct) - embeddings: Optional[EmbeddingsConfig] = Field( + lora: LoraConfig | None = Field(default_factory=LoraConfig.model_construct) + embeddings: EmbeddingsConfig | None = Field( default_factory=EmbeddingsConfig.model_construct ) - sampling: Optional[SamplingConfig] = Field( + sampling: SamplingConfig | None = Field( default_factory=SamplingConfig.model_construct ) - developer: Optional[DeveloperConfig] = Field( + developer: DeveloperConfig | None = Field( default_factory=DeveloperConfig.model_construct ) diff --git a/common/downloader.py b/common/downloader.py index 8307bbcd..8f3155f2 100644 --- a/common/downloader.py +++ b/common/downloader.py @@ -10,7 +10,6 @@ from fnmatch import fnmatch from loguru import logger from rich.progress import Progress -from typing import List, Optional from common.logger import get_progress_bar from common.tabby_config import config @@ -27,7 +26,7 @@ class RepoItem: async def _download_file( session: aiohttp.ClientSession, repo_item: RepoItem, - token: Optional[str], + token: str | None, download_path: pathlib.Path, chunk_limit: int, progress: Progress, @@ -92,7 +91,7 @@ def _get_repo_info(repo_id, revision, token): ] -def _get_download_folder(repo_id: str, repo_type: str, folder_name: Optional[str]): +def _get_download_folder(repo_id: str, repo_type: str, folder_name: str | None): """Gets the download folder for the repo.""" if repo_type == "lora": @@ -105,7 +104,7 @@ def _get_download_folder(repo_id: str, repo_type: str, folder_name: Optional[str def _check_exclusions( - 
filename: str, include_patterns: List[str], exclude_patterns: List[str] + filename: str, include_patterns: list[str], exclude_patterns: list[str] ): include_result = any(fnmatch(filename, pattern) for pattern in include_patterns) exclude_result = any(fnmatch(filename, pattern) for pattern in exclude_patterns) @@ -115,14 +114,14 @@ def _check_exclusions( async def hf_repo_download( repo_id: str, - folder_name: Optional[str], - revision: Optional[str], - token: Optional[str], - include: Optional[List[str]], - exclude: Optional[List[str]], - chunk_limit: Optional[float] = None, - timeout: Optional[int] = None, - repo_type: Optional[str] = "model", + folder_name: str | None, + revision: str | None, + token: str | None, + include: list[str] | None, + exclude: list[str] | None, + chunk_limit: float | None = None, + timeout: int | None = None, + repo_type: str | None = "model", ): """Gets a repo's information from HuggingFace and downloads it locally.""" diff --git a/common/gen_logging.py b/common/gen_logging.py index fcd3c01d..6cf5ddcb 100644 --- a/common/gen_logging.py +++ b/common/gen_logging.py @@ -3,7 +3,6 @@ """ from loguru import logger -from typing import Optional from common.tabby_config import config @@ -29,7 +28,7 @@ def log_generation_params(**kwargs): logger.info(f"Generation options: {kwargs}\n") -def log_prompt(prompt: str, request_id: str, negative_prompt: Optional[str] = None): +def log_prompt(prompt: str, request_id: str, negative_prompt: str | None = None): """Logs the prompt to console.""" if config.logging.log_prompt: formatted_prompt = "\n" + prompt @@ -55,7 +54,7 @@ def log_response(request_id: str, response: str): def log_metrics( request_id: str, metrics: dict, - context_len: Optional[int], + context_len: int | None, max_seq_len: int, ): initial_response = ( diff --git a/common/health.py b/common/health.py index 4d21d6af..a31cb689 100644 --- a/common/health.py +++ b/common/health.py @@ -3,7 +3,6 @@ from datetime import datetime, timezone from functools import partial from pydantic import BaseModel, Field -from typing import Union class UnhealthyEvent(BaseModel): @@ -24,7 +23,7 @@ def __init__(self): self.issues: deque[UnhealthyEvent] = deque(maxlen=100) self._lock = asyncio.Lock() - async def add_unhealthy_event(self, error: Union[str, Exception]): + async def add_unhealthy_event(self, error: str | Exception): """Add a new unhealthy event""" async with self._lock: if isinstance(error, Exception): diff --git a/common/model.py b/common/model.py index 4ac4861a..757688e8 100644 --- a/common/model.py +++ b/common/model.py @@ -10,7 +10,6 @@ from fastapi import HTTPException from loguru import logger from ruamel.yaml import YAML -from typing import Dict, Optional from backends.base_model_container import BaseModelContainer from common.logger import get_loading_progress_bar @@ -21,11 +20,11 @@ from common.utils import deep_merge_dict, unwrap # Global variables for model container -container: Optional[BaseModelContainer] = None +container: BaseModelContainer | None = None embeddings_container = None -_BACKEND_REGISTRY: Dict[str, BaseModelContainer] = {} +_BACKEND_REGISTRY: dict[str, BaseModelContainer] = {} if dependencies.exllamav2: from backends.exllamav2.model import ExllamaV2Container @@ -42,7 +41,7 @@ if dependencies.extras: from backends.infinity.model import InfinityContainer - embeddings_container: Optional[InfinityContainer] = None + embeddings_container: InfinityContainer | None = None class ModelType(Enum): diff --git a/common/multimodal.py b/common/multimodal.py index 
b92386f3..11e401dc 100644 --- a/common/multimodal.py +++ b/common/multimodal.py @@ -3,7 +3,6 @@ from common import model from loguru import logger from pydantic import BaseModel, Field -from typing import List from common.optional_dependencies import dependencies @@ -18,7 +17,7 @@ class MultimodalEmbeddingWrapper(BaseModel): type: str = None content: list = Field(default_factory=list) - text_alias: List[str] = Field(default_factory=list) + text_alias: list[str] = Field(default_factory=list) async def add(self, url: str): # Determine the type of vision embedding to use diff --git a/common/networking.py b/common/networking.py index 597ed078..f4d72917 100644 --- a/common/networking.py +++ b/common/networking.py @@ -7,7 +7,6 @@ from fastapi import Depends, HTTPException, Request from loguru import logger from pydantic import BaseModel -from typing import Optional from uuid import uuid4 from common.tabby_config import config @@ -17,7 +16,7 @@ class TabbyRequestErrorMessage(BaseModel): """Common request error type.""" message: str - trace: Optional[str] = None + trace: str | None = None class TabbyRequestError(BaseModel): diff --git a/common/sampling.py b/common/sampling.py index 49be5b99..832661ed 100644 --- a/common/sampling.py +++ b/common/sampling.py @@ -14,7 +14,6 @@ field_validator, model_validator, ) -from typing import Dict, List, Optional, Union from common.utils import filter_none_values, unwrap @@ -23,7 +22,7 @@ class BaseSamplerRequest(BaseModel): """Common class for sampler params that are used in APIs""" - max_tokens: Optional[int] = Field( + max_tokens: int | None = Field( default_factory=lambda: get_default_sampler_value("max_tokens"), validation_alias=AliasChoices( "max_tokens", "max_completion_tokens", "max_length" @@ -33,7 +32,7 @@ class BaseSamplerRequest(BaseModel): ge=0, ) - min_tokens: Optional[int] = Field( + min_tokens: int | None = Field( default_factory=lambda: get_default_sampler_value("min_tokens", 0), validation_alias=AliasChoices("min_tokens", "min_length"), description="Aliases: min_length", @@ -41,76 +40,76 @@ class BaseSamplerRequest(BaseModel): ge=0, ) - stop: Optional[Union[str, List[Union[str, int]]]] = Field( + stop: str | list[str | int] | None = Field( default_factory=lambda: get_default_sampler_value("stop", []), validation_alias=AliasChoices("stop", "stop_sequence"), description="Aliases: stop_sequence", ) - banned_strings: Optional[Union[str, List[str]]] = Field( + banned_strings: str | list[str] | None = Field( default_factory=lambda: get_default_sampler_value("banned_strings", []) ) - banned_tokens: Optional[Union[List[int], str]] = Field( + banned_tokens: list[int] | str | None = Field( default_factory=lambda: get_default_sampler_value("banned_tokens", []), validation_alias=AliasChoices("banned_tokens", "custom_token_bans"), description="Aliases: custom_token_bans", examples=[[128, 330]], ) - allowed_tokens: Optional[Union[List[int], str]] = Field( + allowed_tokens: list[int] | str | None = Field( default_factory=lambda: get_default_sampler_value("allowed_tokens", []), validation_alias=AliasChoices("allowed_tokens", "allowed_token_ids"), description="Aliases: allowed_token_ids", examples=[[128, 330]], ) - token_healing: Optional[bool] = Field( + token_healing: bool | None = Field( default_factory=lambda: get_default_sampler_value("token_healing", False) ) - temperature: Optional[float] = Field( + temperature: float | None = Field( default_factory=lambda: get_default_sampler_value("temperature", 1.0), examples=[1.0], ge=0, le=10, ) - temperature_last: 
Optional[bool] = Field( + temperature_last: bool | None = Field( default_factory=lambda: get_default_sampler_value("temperature_last", False), ) - smoothing_factor: Optional[float] = Field( + smoothing_factor: float | None = Field( default_factory=lambda: get_default_sampler_value("smoothing_factor", 0.0), ge=0, ) - top_k: Optional[int] = Field( + top_k: int | None = Field( default_factory=lambda: get_default_sampler_value("top_k", 0), ge=-1, ) - top_p: Optional[float] = Field( + top_p: float | None = Field( default_factory=lambda: get_default_sampler_value("top_p", 1.0), ge=0, le=1, examples=[1.0], ) - top_a: Optional[float] = Field( + top_a: float | None = Field( default_factory=lambda: get_default_sampler_value("top_a", 0.0) ) - min_p: Optional[float] = Field( + min_p: float | None = Field( default_factory=lambda: get_default_sampler_value("min_p", 0.0) ) - tfs: Optional[float] = Field( + tfs: float | None = Field( default_factory=lambda: get_default_sampler_value("tfs", 1.0), examples=[1.0], ) - typical: Optional[float] = Field( + typical: float | None = Field( default_factory=lambda: get_default_sampler_value("typical", 1.0), validation_alias=AliasChoices("typical", "typical_p"), description="Aliases: typical_p", @@ -119,30 +118,30 @@ class BaseSamplerRequest(BaseModel): le=1, ) - skew: Optional[float] = Field( + skew: float | None = Field( default_factory=lambda: get_default_sampler_value("skew", 0.0), examples=[0.0], ) - xtc_probability: Optional[float] = Field( + xtc_probability: float | None = Field( default_factory=lambda: get_default_sampler_value("xtc_probability", 0.0), ) - xtc_threshold: Optional[float] = Field( + xtc_threshold: float | None = Field( default_factory=lambda: get_default_sampler_value("xtc_threshold", 0.1) ) - frequency_penalty: Optional[float] = Field( + frequency_penalty: float | None = Field( default_factory=lambda: get_default_sampler_value("frequency_penalty", 0.0), ge=0, ) - presence_penalty: Optional[float] = Field( + presence_penalty: float | None = Field( default_factory=lambda: get_default_sampler_value("presence_penalty", 0.0), ge=0, ) - repetition_penalty: Optional[float] = Field( + repetition_penalty: float | None = Field( default_factory=lambda: get_default_sampler_value("repetition_penalty", 1.0), validation_alias=AliasChoices("repetition_penalty", "rep_pen"), description="Aliases: rep_pen", @@ -150,7 +149,7 @@ class BaseSamplerRequest(BaseModel): gt=0, ) - penalty_range: Optional[int] = Field( + penalty_range: int | None = Field( default_factory=lambda: get_default_sampler_value("penalty_range", -1), validation_alias=AliasChoices( "penalty_range", @@ -163,91 +162,91 @@ class BaseSamplerRequest(BaseModel): ), ) - repetition_decay: Optional[int] = Field( + repetition_decay: int | None = Field( default_factory=lambda: get_default_sampler_value("repetition_decay", 0) ) - dry_multiplier: Optional[float] = Field( + dry_multiplier: float | None = Field( default_factory=lambda: get_default_sampler_value("dry_multiplier", 0.0) ) - dry_base: Optional[float] = Field( + dry_base: float | None = Field( default_factory=lambda: get_default_sampler_value("dry_base", 0.0) ) - dry_allowed_length: Optional[int] = Field( + dry_allowed_length: int | None = Field( default_factory=lambda: get_default_sampler_value("dry_allowed_length", 0) ) - dry_range: Optional[int] = Field( + dry_range: int | None = Field( default_factory=lambda: get_default_sampler_value("dry_range", 0), validation_alias=AliasChoices("dry_range", "dry_penalty_last_n"), description=("Aliases: 
dry_penalty_last_n"), ) - dry_sequence_breakers: Optional[Union[str, List[str]]] = Field( + dry_sequence_breakers: str | list[str] | None = Field( default_factory=lambda: get_default_sampler_value("dry_sequence_breakers", []) ) - mirostat_mode: Optional[int] = Field( + mirostat_mode: int | None = Field( default_factory=lambda: get_default_sampler_value("mirostat_mode", 0), alias=AliasChoices("mirostat_mode", "mirostat"), ) - mirostat_tau: Optional[float] = Field( + mirostat_tau: float | None = Field( default_factory=lambda: get_default_sampler_value("mirostat_tau", 1.5), examples=[1.5], ) - mirostat_eta: Optional[float] = Field( + mirostat_eta: float | None = Field( default_factory=lambda: get_default_sampler_value("mirostat_eta", 0.3), examples=[0.3], ) - add_bos_token: Optional[bool] = Field( + add_bos_token: bool | None = Field( default_factory=lambda: get_default_sampler_value("add_bos_token") ) - ban_eos_token: Optional[bool] = Field( + ban_eos_token: bool | None = Field( default_factory=lambda: get_default_sampler_value("ban_eos_token", False), validation_alias=AliasChoices("ban_eos_token", "ignore_eos"), description="Aliases: ignore_eos", examples=[False], ) - logit_bias: Optional[Dict[int, float]] = Field( + logit_bias: dict[int, float] | None = Field( default_factory=lambda: get_default_sampler_value("logit_bias"), examples=[{"1": 10, "2": 50}], ) - negative_prompt: Optional[str] = Field( + negative_prompt: str | None = Field( default_factory=lambda: get_default_sampler_value("negative_prompt") ) - json_schema: Optional[object] = Field( + json_schema: object | None = Field( default_factory=lambda: get_default_sampler_value("json_schema"), ) - regex_pattern: Optional[str] = Field( + regex_pattern: str | None = Field( default_factory=lambda: get_default_sampler_value("regex_pattern"), ) - grammar_string: Optional[str] = Field( + grammar_string: str | None = Field( default_factory=lambda: get_default_sampler_value("grammar_string"), ) - speculative_ngram: Optional[bool] = Field( + speculative_ngram: bool | None = Field( default_factory=lambda: get_default_sampler_value("speculative_ngram"), ) - cfg_scale: Optional[float] = Field( + cfg_scale: float | None = Field( default_factory=lambda: get_default_sampler_value("cfg_scale", 1.0), validation_alias=AliasChoices("cfg_scale", "guidance_scale"), description="Aliases: guidance_scale", examples=[1.0], ) - max_temp: Optional[float] = Field( + max_temp: float | None = Field( default_factory=lambda: get_default_sampler_value("max_temp", 1.0), validation_alias=AliasChoices("max_temp", "dynatemp_high"), description="Aliases: dynatemp_high", @@ -255,7 +254,7 @@ class BaseSamplerRequest(BaseModel): ge=0, ) - min_temp: Optional[float] = Field( + min_temp: float | None = Field( default_factory=lambda: get_default_sampler_value("min_temp", 1.0), validation_alias=AliasChoices("min_temp", "dynatemp_low"), description="Aliases: dynatemp_low", @@ -263,14 +262,14 @@ class BaseSamplerRequest(BaseModel): ge=0, ) - temp_exponent: Optional[float] = Field( + temp_exponent: float | None = Field( default_factory=lambda: get_default_sampler_value("temp_exponent", 1.0), validation_alias=AliasChoices("temp_exponent", "dynatemp_exponent"), examples=[1.0], ge=0, ) - logprobs: Optional[int] = Field( + logprobs: int | None = Field( default_factory=lambda: get_default_sampler_value("logprobs", 0), ge=0, ) @@ -335,7 +334,7 @@ def after_validate(self): class SamplerOverridesContainer(BaseModel): - selected_preset: Optional[str] = None + selected_preset: str | None = 
None overrides: dict = {} diff --git a/common/tabby_config.py b/common/tabby_config.py index 9c4cc5d6..212746f1 100644 --- a/common/tabby_config.py +++ b/common/tabby_config.py @@ -2,7 +2,6 @@ from inspect import getdoc from os import getenv from textwrap import dedent -from typing import Optional from loguru import logger from pydantic import BaseModel @@ -22,7 +21,7 @@ class TabbyConfig(TabbyConfigModel): model_defaults: dict = {} draft_model_defaults: dict = {} - def load(self, arguments: Optional[dict] = None): + def load(self, arguments: dict | None = None): """Synchronously loads the global application config""" # config is applied in order of items in the list @@ -45,7 +44,7 @@ def load(self, arguments: Optional[dict] = None): value = getattr(merged_config_model, field) setattr(self, field, value) - # Set model defaults dict once to prevent on-demand reconstruction + # set model defaults dict once to prevent on-demand reconstruction # TODO: clean this up a bit for field in self.model.use_as_default: if hasattr(self.model, field): diff --git a/common/templating.py b/common/templating.py index cc0cceb1..864ee337 100644 --- a/common/templating.py +++ b/common/templating.py @@ -7,7 +7,6 @@ from dataclasses import dataclass, field from datetime import datetime from importlib.metadata import version as package_version -from typing import List, Optional from jinja2 import Template, TemplateError from jinja2.ext import loopcontrols from jinja2.sandbox import ImmutableSandboxedEnvironment @@ -28,8 +27,8 @@ class TemplateLoadError(Exception): class TemplateMetadata: """Represents the parsed metadata from a template.""" - stop_strings: List[str] = field(default_factory=list) - tool_start: Optional[str] = None + stop_strings: list[str] = field(default_factory=list) + tool_start: str | None = None class PromptTemplate: @@ -44,7 +43,7 @@ class PromptTemplate: enable_async=True, extensions=[loopcontrols], ) - metadata: Optional[TemplateMetadata] = None + metadata: TemplateMetadata | None = None async def extract_metadata(self, template_vars: dict): """ @@ -145,7 +144,7 @@ async def from_file(cls, template_path: pathlib.Path): @classmethod async def from_model_json( - cls, json_path: pathlib.Path, key: str, name: Optional[str] = None + cls, json_path: pathlib.Path, key: str, name: str | None = None ): """Get a template from a JSON file. Requires a key and template name""" if not json_path.exists(): diff --git a/common/transformers_utils.py b/common/transformers_utils.py index a7b0f0c1..92441564 100644 --- a/common/transformers_utils.py +++ b/common/transformers_utils.py @@ -3,7 +3,6 @@ import pathlib from loguru import logger from pydantic import BaseModel -from typing import Dict, List, Optional, Set, Union class GenerationConfig(BaseModel): @@ -12,7 +11,7 @@ class GenerationConfig(BaseModel): Will be expanded as needed. 
""" - eos_token_id: Optional[Union[int, List[int]]] = None + eos_token_id: int | list[int] | None = None @classmethod async def from_directory(cls, model_directory: pathlib.Path): @@ -44,8 +43,8 @@ class HuggingFaceConfig(BaseModel): """ max_position_embeddings: int = 4096 - eos_token_id: Optional[Union[int, List[int]]] = None - quantization_config: Optional[Dict] = None + eos_token_id: int | list[int] | None = None + quantization_config: dict | None = None @classmethod async def from_directory(cls, model_directory: pathlib.Path): @@ -62,7 +61,7 @@ async def from_directory(cls, model_directory: pathlib.Path): def quant_method(self): """Wrapper method to fetch quant type""" - if isinstance(self.quantization_config, Dict): + if isinstance(self.quantization_config, dict): return self.quantization_config.get("quant_method") else: return None @@ -83,7 +82,7 @@ class TokenizerConfig(BaseModel): An abridged version of HuggingFace's tokenizer config. """ - add_bos_token: Optional[bool] = True + add_bos_token: bool | None = True @classmethod async def from_directory(cls, model_directory: pathlib.Path): @@ -111,8 +110,8 @@ class HFModel: """ hf_config: HuggingFaceConfig - tokenizer_config: Optional[TokenizerConfig] = None - generation_config: Optional[GenerationConfig] = None + tokenizer_config: TokenizerConfig | None = None + generation_config: GenerationConfig | None = None @classmethod async def from_directory(cls, model_directory: pathlib.Path): @@ -156,7 +155,7 @@ def quant_method(self): def eos_tokens(self): """Combines and returns EOS tokens from various configs""" - eos_ids: Set[int] = set() + eos_ids: set[int] = set() eos_ids.update(self.hf_config.eos_tokens()) diff --git a/common/utils.py b/common/utils.py index b0d7ad24..da841799 100644 --- a/common/utils.py +++ b/common/utils.py @@ -1,12 +1,12 @@ """Common utility functions""" -from types import NoneType -from typing import Dict, Optional, Type, Union, get_args, get_origin, TypeVar +from types import NoneType, UnionType +from typing import Type, get_args, get_origin, TypeVar T = TypeVar("T") -def unwrap(wrapped: Optional[T], default: T = None) -> T: +def unwrap(wrapped: T | None, default: T = None) -> T: """Unwrap function for Optionals.""" if wrapped is None: return default @@ -19,7 +19,7 @@ def coalesce(*args): return next((arg for arg in args if arg is not None), None) -def filter_none_values(collection: Union[dict, list]) -> Union[dict, list]: +def filter_none_values(collection: dict | list) -> dict | list: """Remove None values from a collection.""" if isinstance(collection, dict): @@ -32,7 +32,7 @@ def filter_none_values(collection: Union[dict, list]) -> Union[dict, list]: return collection -def deep_merge_dict(dict1: Dict, dict2: Dict, copy: bool = False) -> Dict: +def deep_merge_dict(dict1: dict, dict2: dict, copy: bool = False) -> dict: """ Merge 2 dictionaries. If copy is true, the original dictionary isn't modified. """ @@ -49,7 +49,7 @@ def deep_merge_dict(dict1: Dict, dict2: Dict, copy: bool = False) -> Dict: return dict1 -def deep_merge_dicts(*dicts: Dict) -> Dict: +def deep_merge_dicts(*dicts: dict) -> dict: """ Merge an arbitrary amount of dictionaries. We wanna do in-place modification for each level, so do not copy. @@ -84,11 +84,13 @@ def is_list_type(type_hint) -> bool: def unwrap_optional_type(type_hint) -> Type: """ - Unwrap Optional[type] annotations. + Unwrap type | None annotations to extract the base type. This is not the same as unwrap. 
""" - if get_origin(type_hint) is Union: + origin = get_origin(type_hint) + + if origin is UnionType: args = get_args(type_hint) if NoneType in args: for arg in args: diff --git a/config_sample.yml b/config_sample.yml index 0b65f9e8..88fa308a 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -79,7 +79,7 @@ model: backend: # Max sequence length (default: min(max_position_embeddings, cache_size)). - # Set to -1 to fetch from the model's config.json + # set to -1 to fetch from the model's config.json max_seq_len: # Size of the key/value cache to allocate, in tokens (default: 4096). @@ -126,7 +126,7 @@ model: rope_scale: 1.0 # Rope alpha (default: None). - # Same as alpha_value. Set to "auto" to auto-calculate. + # Same as alpha_value. set to "auto" to auto-calculate. # Leaving this value blank will either pull from the model or auto-calculate. rope_alpha: @@ -141,12 +141,12 @@ model: # Used by EXL3 models only. output_chunking: true - # Set the maximum number of prompts to process at one time (default: None/Automatic). + # set the maximum number of prompts to process at one time (default: None/Automatic). # Automatically calculated if left blank. # NOTE: Only available for Nvidia ampere (30 series) and above GPUs. max_batch_size: - # Set the prompt template for this model. (default: None) + # set the prompt template for this model. (default: None) # If empty, attempts to look for the model's chat template. # If a model contains multiple templates in its tokenizer_config.json, # set prompt_template to the name of the template you want to use. @@ -172,7 +172,7 @@ draft_model: draft_rope_scale: 1.0 # Rope alpha for draft models (default: None). - # Same as alpha_value. Set to "auto" to auto-calculate. + # Same as alpha_value. set to "auto" to auto-calculate. # Leaving this value blank will either pull from the model or auto-calculate. draft_rope_alpha: @@ -199,7 +199,7 @@ lora: # Directory to look for LoRAs (default: loras). lora_dir: loras - # List of LoRAs to load and associated scaling factors (default scale: 1.0). + # list of LoRAs to load and associated scaling factors (default scale: 1.0). # For the YAML file, add each entry as a YAML list: # - name: lora1 # scaling: 1.0 @@ -230,7 +230,7 @@ developer: # Disable API request streaming (default: False). disable_request_streaming: false - # Set process to use a higher priority. + # set process to use a higher priority. # For realtime process priority, run as administrator or sudo. # Otherwise, the priority will be set to high. realtime_process_priority: false diff --git a/docker/Dockerfile b/docker/Dockerfile index 58aa61f5..ba038cec 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,7 +20,7 @@ ENV PATH="/opt/venv/bin:$PATH" # Upgrade pip RUN pip install --no-cache-dir --upgrade pip -# Set the working directory in the container +# set the working directory in the container WORKDIR /app # Get requirements @@ -37,7 +37,7 @@ COPY . . # Make port 5000 available to the world outside this container EXPOSE 5000 -# Set the entry point +# set the entry point ENTRYPOINT ["python3"] # Run main.py when the container launches diff --git a/docs/01.-Getting-Started.md b/docs/01.-Getting-Started.md index 330116c6..7e74abd9 100644 --- a/docs/01.-Getting-Started.md +++ b/docs/01.-Getting-Started.md @@ -171,7 +171,7 @@ These are short-form instructions for other methods that users can use to instal 2. For Windows: [Cuda Toolkit on WSL](https://docs.nvidia.com/cuda/wsl-user-guide/index.html) 3. 
Clone TabbyAPI via `git clone https://github.com/theroyallab/tabbyAPI` 4. Enter the tabbyAPI directory by `cd tabbyAPI`. - 1. Optional: Set up a config.yml or api_tokens.yml ([configuration](#configuration)) + 1. Optional: set up a config.yml or api_tokens.yml ([configuration](#configuration)) 5. Update the volume mount section in the `docker/docker-compose.yml` file ```yml volumes: diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md index 98cee556..03f866ad 100644 --- a/docs/02.-Server-options.md +++ b/docs/02.-Server-options.md @@ -16,12 +16,12 @@ All of these options have descriptive comments above them. You should not need t | Config Option | Type (Default) | Description | | ---------------------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| host | String (127.0.0.1) | Set the IP address used for hosting TabbyAPI | -| port | Int (5000) | Set the TCP Port use for TabbyAPI | +| host | String (127.0.0.1) | set the IP address used for hosting TabbyAPI | +| port | Int (5000) | set the TCP Port use for TabbyAPI | | disable_auth | Bool (False) | Disables API authentication | | disable_fetch_requests | Bool (False) | Disables fetching external content when responding to requests (ex. fetching images from URLs) | | send_tracebacks | Bool (False) | Send server tracebacks to client.
Note: It's not recommended to enable this if sharing the instance with others. | -| api_servers | List[String] (["OAI"]) | API servers to enable. Possible values `"OAI", "Kobold"` | +| api_servers | list[String] (["OAI"]) | API servers to enable. Possible values `"OAI", "Kobold"` | ### Logging Options @@ -47,7 +47,7 @@ Note: These are experimental flags that may be removed at any point. | ------------------------- | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | unsafe_launch | Bool (False) | Skips dependency checks on startup. Only recommended for debugging. | | disable_request_streaming | Bool (False) | Forcefully disables streaming requests | -| realtime_process_priority | Bool (False) | Set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. | +| realtime_process_priority | Bool (False) | set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. | ### Model Options @@ -58,14 +58,14 @@ Note: Most of the options here will only apply on initial model load/startup (ep | model_dir | String ("models") | Directory to look for models.
Note: Persisted across subsequent load requests | | inline_model_loading | Bool (False) | Enables ability to switch models using the `model` argument in a generation request. More info in [Usage](https://github.com/theroyallab/tabbyAPI/wiki/03.-Usage#inline-loading) | | use_dummy_models | Bool (False) | Send a dummy OAI model card when calling the `/v1/models` endpoint. Used for clients which enforce specific OAI models.
Note: Persisted across subsequent load requests | -| dummy_model_names | List[String] (["gpt-3.5-turbo"]) | List of dummy names to send on model endpoint requests | +| dummy_model_names | list[String] (["gpt-3.5-turbo"]) | list of dummy names to send on model endpoint requests | | model_name | String (None) | Folder name of a model to load. The below parameters will not apply unless this is filled out. | -| use_as_default | List[String] ([]) | Keys to use by default when loading models. For example, putting `cache_mode` in this array will make every model load with that value unless specified by the API request.
Note: Also applies to the `draft` sub-block | +| use_as_default | list[String] ([]) | Keys to use by default when loading models. For example, putting `cache_mode` in this array will make every model load with that value unless specified by the API request.
Note: Also applies to the `draft` sub-block | | max_seq_len | Float (None) | Maximum sequence length of the model. Uses the value from config.json if not specified here. Also called the max context length. | | tensor_parallel | Bool (False) | Enables tensor parallelism. Automatically falls back to autosplit if GPU split isn't provided.
Note: `gpu_split_auto` is ignored when this is enabled. | | gpu_split_auto | Bool (True) | Automatically split the model across multiple GPUs. Manual GPU split isn't used if this is enabled. | -| autosplit_reserve | List[Int] ([96]) | Amount of empty VRAM to reserve when loading with autosplit.
Represented as an array of MB per GPU used. | -| gpu_split | List[Float] ([]) | Float array of GBs to split a model between GPUs. | +| autosplit_reserve | list[Int] ([96]) | Amount of empty VRAM to reserve when loading with autosplit.
Represented as an array of MB per GPU used. | +| gpu_split | list[Float] ([]) | Float array of GBs to split a model between GPUs. | | rope_scale | Float (1.0) | Adjustment for rope scale (or compress_pos_emb)
Note: If the model has YaRN support, this option will not apply. | | rope_alpha | Float (None) | Adjustment for rope alpha. Leave blank to automatically calculate based on the max_seq_len.
Note: If the model has YaRN support, this option will not apply. | | cache_mode | String ("FP16") | Cache mode for the model.
Options: FP16, Q8, Q6, Q4 | @@ -86,7 +86,7 @@ Note: Sub-block of Model Options. Same rules apply. | draft_rope_scale | Float (1.0) | String: RoPE scale value for the draft model. | | draft_rope_alpha | Float (1.0) | RoPE alpha value for the draft model. Leave blank for auto-calculation. | | draft_cache_mode | String ("FP16") | Cache mode for the draft model.
Options: FP16, Q8, Q6, Q4 | -| draft_gpu_split | List[Float] ([]) | Float array of GBs to split a draft model between GPUs. | +| draft_gpu_split | list[Float] ([]) | Float array of GBs to split a draft model between GPUs. | ### Lora Options @@ -95,7 +95,7 @@ Note: Sub-block of Mode Options. Same rules apply. | Config Option | Type (Default) | Description | |---------------|------------------|--------------------------------------------------------------| | lora_dir | String ("loras") | Directory to look for loras.
Note: Persisted across subsequent load requests | -| loras | List[loras] ([]) | List of lora objects to apply to the model. Each object contains a name and scaling. | +| loras | list[loras] ([]) | list of lora objects to apply to the model. Each object contains a name and scaling. | | name | String (None) | Folder name of a lora to load.
Note: An element of the `loras` key | | scaling | Float (1.0) | "Weight" to apply the lora on the parent model. For example, applying a lora with 0.9 scaling will lower the amount of application on the parent model.
Note: An element of the `loras` key | diff --git a/docs/06.-Sharing.md b/docs/06.-Sharing.md index 86288d80..a9f69559 100644 --- a/docs/06.-Sharing.md +++ b/docs/06.-Sharing.md @@ -28,7 +28,7 @@ Tailscale is a product that uses the WireGuard protocol to provide a mesh networ > This is not a method for exposing your TabbyAPI instance to the world. If you want that, use the other two services. To get started: -1. Set your TabbyAPI ip to `0.0.0.0` otherwise you will not be able to access your instance outside your local machine. +1. set your TabbyAPI ip to `0.0.0.0` otherwise you will not be able to access your instance outside your local machine. 2. Sign up and get started on [Tailscale's website](https://tailscale.com/), then install the client. 3. Connect to your tailscale account on both your host and client machine. 4. Select the Tailscale icon (usually in the system tray) and get the IP of your host device. This is usually identified by the hostname. diff --git a/endpoints/Kobold/types/generation.py b/endpoints/Kobold/types/generation.py index 5432130b..a58bceba 100644 --- a/endpoints/Kobold/types/generation.py +++ b/endpoints/Kobold/types/generation.py @@ -1,6 +1,5 @@ from functools import partial from pydantic import BaseModel, Field, field_validator -from typing import List, Optional from common.sampling import BaseSamplerRequest, get_default_sampler_value from common.utils import unwrap @@ -8,9 +7,9 @@ class GenerateRequest(BaseSamplerRequest): prompt: str - genkey: Optional[str] = None - use_default_badwordsids: Optional[bool] = False - dynatemp_range: Optional[float] = Field( + genkey: str | None = None + use_default_badwordsids: bool | None = False + dynatemp_range: float | None = Field( default_factory=partial(get_default_sampler_value, "dynatemp_range") ) @@ -43,7 +42,7 @@ class GenerateResponseResult(BaseModel): class GenerateResponse(BaseModel): - results: List[GenerateResponseResult] = Field(default_factory=list) + results: list[GenerateResponseResult] = Field(default_factory=list) class StreamGenerateChunk(BaseModel): diff --git a/endpoints/Kobold/types/token.py b/endpoints/Kobold/types/token.py index e6639d94..3f5a9fe6 100644 --- a/endpoints/Kobold/types/token.py +++ b/endpoints/Kobold/types/token.py @@ -1,5 +1,4 @@ from pydantic import BaseModel -from typing import List class TokenCountRequest(BaseModel): @@ -12,4 +11,4 @@ class TokenCountResponse(BaseModel): """Represents a KAI tokenization response.""" value: int - ids: List[int] + ids: list[int] diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py index 8f4e7a4e..bb64a48a 100644 --- a/endpoints/OAI/router.py +++ b/endpoints/OAI/router.py @@ -72,7 +72,7 @@ async def completion_request( disable_request_streaming = config.developer.disable_request_streaming - # Set an empty JSON schema if the request wants a JSON response + # set an empty JSON schema if the request wants a JSON response if data.response_format.type == "json": data.json_schema = {"type": "object"} @@ -125,7 +125,7 @@ async def chat_completion_request( prompt, embeddings = await apply_chat_template(data) - # Set an empty JSON schema if the request wants a JSON response + # set an empty JSON schema if the request wants a JSON response if data.response_format.type == "json": data.json_schema = {"type": "object"} diff --git a/endpoints/OAI/types/chat_completion.py b/endpoints/OAI/types/chat_completion.py index 52523149..86118463 100644 --- a/endpoints/OAI/types/chat_completion.py +++ b/endpoints/OAI/types/chat_completion.py @@ -1,6 +1,8 @@ +from 
__future__ import annotations + from pydantic import AliasChoices, BaseModel, Field, field_validator from time import time -from typing import Literal, Union, List, Optional, Dict +from typing import Literal from uuid import uuid4 from endpoints.OAI.types.common import UsageStats, CommonCompletionRequest @@ -10,11 +12,11 @@ class ChatCompletionLogprob(BaseModel): token: str logprob: float - top_logprobs: Optional[List["ChatCompletionLogprob"]] = Field(default_factory=list) + top_logprobs: list[ChatCompletionLogprob] | None = Field(default_factory=list) class ChatCompletionLogprobs(BaseModel): - content: List[ChatCompletionLogprob] = Field(default_factory=list) + content: list[ChatCompletionLogprob] = Field(default_factory=list) class ChatCompletionImageUrl(BaseModel): @@ -23,58 +25,58 @@ class ChatCompletionImageUrl(BaseModel): class ChatCompletionMessagePart(BaseModel): type: Literal["text", "image_url"] = "text" - text: Optional[str] = None - image_url: Optional[ChatCompletionImageUrl] = None + text: str | None = None + image_url: ChatCompletionImageUrl | None = None class ChatCompletionMessage(BaseModel): role: str = "user" - content: Optional[Union[str, List[ChatCompletionMessagePart]]] = None - tool_calls: Optional[List[ToolCall]] = None - tool_call_id: Optional[str] = None + content: str | list[ChatCompletionMessagePart] | None = None + tool_calls: list[ToolCall] | None = None + tool_call_id: str | None = None class ChatCompletionRespChoice(BaseModel): # Index is 0 since we aren't using multiple choices index: int = 0 - finish_reason: Optional[str] = None + finish_reason: str | None = None # let's us understand why it stopped and if we need to generate a tool_call - stop_str: Optional[str] = None + stop_str: str | None = None message: ChatCompletionMessage - logprobs: Optional[ChatCompletionLogprobs] = None + logprobs: ChatCompletionLogprobs | None = None class ChatCompletionStreamChoice(BaseModel): # Index is 0 since we aren't using multiple choices index: int = 0 - finish_reason: Optional[str] = None - delta: Union[ChatCompletionMessage, dict] = {} - logprobs: Optional[ChatCompletionLogprobs] = None + finish_reason: str | None = None + delta: ChatCompletionMessage | dict = {} + logprobs: ChatCompletionLogprobs | None = None # Inherited from common request class ChatCompletionRequest(CommonCompletionRequest): - messages: List[ChatCompletionMessage] - prompt_template: Optional[str] = None - add_generation_prompt: Optional[bool] = True - template_vars: Optional[dict] = Field( + messages: list[ChatCompletionMessage] + prompt_template: str | None = None + add_generation_prompt: bool | None = True + template_vars: dict | None = Field( default={}, validation_alias=AliasChoices("template_vars", "chat_template_kwargs"), description="Aliases: chat_template_kwargs", ) - response_prefix: Optional[str] = None - model: Optional[str] = None + response_prefix: str | None = None + model: str | None = None # tools is follows the format OAI schema, functions is more flexible # both are available in the chat template. - tools: Optional[List[ToolSpec]] = None - functions: Optional[List[Dict]] = None + tools: list[ToolSpec] | None = None + functions: list[dict] | None = None # Chat completions requests do not have a BOS token preference. Backend # respects the tokenization config for the individual model. 
- add_bos_token: Optional[bool] = None + add_bos_token: bool | None = None @field_validator("add_bos_token", mode="after") def force_bos_token(cls, v): @@ -84,17 +86,17 @@ def force_bos_token(cls, v): class ChatCompletionResponse(BaseModel): id: str = Field(default_factory=lambda: f"chatcmpl-{uuid4().hex}") - choices: List[ChatCompletionRespChoice] + choices: list[ChatCompletionRespChoice] created: int = Field(default_factory=lambda: int(time())) model: str object: str = "chat.completion" - usage: Optional[UsageStats] = None + usage: UsageStats | None = None class ChatCompletionStreamChunk(BaseModel): id: str = Field(default_factory=lambda: f"chatcmpl-{uuid4().hex}") - choices: List[ChatCompletionStreamChoice] + choices: list[ChatCompletionStreamChoice] created: int = Field(default_factory=lambda: int(time())) model: str object: str = "chat.completion.chunk" - usage: Optional[UsageStats] = None + usage: UsageStats | None = None diff --git a/endpoints/OAI/types/common.py b/endpoints/OAI/types/common.py index 16ef2edd..fb7a4a82 100644 --- a/endpoints/OAI/types/common.py +++ b/endpoints/OAI/types/common.py @@ -1,7 +1,6 @@ """Common types for OAI.""" from pydantic import BaseModel, Field -from typing import Optional, Union from common.sampling import BaseSamplerRequest, get_default_sampler_value @@ -10,13 +9,13 @@ class UsageStats(BaseModel): """Represents usage stats.""" prompt_tokens: int - prompt_time: Optional[float] = None - prompt_tokens_per_sec: Optional[Union[float, str]] = None + prompt_time: float | None = None + prompt_tokens_per_sec: float | str | None = None completion_tokens: int - completion_time: Optional[float] = None - completion_tokens_per_sec: Optional[Union[float, str]] = None + completion_time: float | None = None + completion_tokens_per_sec: float | str | None = None total_tokens: int - total_time: Optional[float] = None + total_time: float | None = None class CompletionResponseFormat(BaseModel): @@ -24,7 +23,7 @@ class CompletionResponseFormat(BaseModel): class ChatCompletionStreamOptions(BaseModel): - include_usage: Optional[bool] = False + include_usage: bool | None = False class CommonCompletionRequest(BaseSamplerRequest): @@ -32,29 +31,29 @@ class CommonCompletionRequest(BaseSamplerRequest): # Model information # This parameter is not used, the loaded model is used instead - model: Optional[str] = None + model: str | None = None # Generation info (remainder is in BaseSamplerRequest superclass) - stream: Optional[bool] = False - stream_options: Optional[ChatCompletionStreamOptions] = None - response_format: Optional[CompletionResponseFormat] = Field( + stream: bool | None = False + stream_options: ChatCompletionStreamOptions | None = None + response_format: CompletionResponseFormat | None = Field( default_factory=CompletionResponseFormat ) - n: Optional[int] = Field( + n: int | None = Field( default_factory=lambda: get_default_sampler_value("n", 1), ge=1, ) # Extra OAI request stuff - best_of: Optional[int] = Field( + best_of: int | None = Field( description="Not parsed. Only used for OAI compliance.", default=None ) - echo: Optional[bool] = Field( + echo: bool | None = Field( description="Not parsed. Only used for OAI compliance.", default=False ) - suffix: Optional[str] = Field( + suffix: str | None = Field( description="Not parsed. Only used for OAI compliance.", default=None ) - user: Optional[str] = Field( + user: str | None = Field( description="Not parsed. 
Only used for OAI compliance.", default=None ) diff --git a/endpoints/OAI/types/completion.py b/endpoints/OAI/types/completion.py index d0a7187e..bb3f8443 100644 --- a/endpoints/OAI/types/completion.py +++ b/endpoints/OAI/types/completion.py @@ -2,7 +2,6 @@ from pydantic import BaseModel, Field from time import time -from typing import Dict, List, Optional, Union from uuid import uuid4 from endpoints.OAI.types.common import CommonCompletionRequest, UsageStats @@ -11,10 +10,10 @@ class CompletionLogProbs(BaseModel): """Represents log probabilities for a completion request.""" - text_offset: List[int] = Field(default_factory=list) - token_logprobs: List[Optional[float]] = Field(default_factory=list) - tokens: List[str] = Field(default_factory=list) - top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list) + text_offset: list[int] = Field(default_factory=list) + token_logprobs: list[float | None] = Field(default_factory=list) + tokens: list[str] = Field(default_factory=list) + top_logprobs: list[dict[str, float] | None] = Field(default_factory=list) class CompletionRespChoice(BaseModel): @@ -22,8 +21,8 @@ class CompletionRespChoice(BaseModel): # Index is 0 since we aren't using multiple choices index: int = 0 - finish_reason: Optional[str] = None - logprobs: Optional[CompletionLogProbs] = None + finish_reason: str | None = None + logprobs: CompletionLogProbs | None = None text: str @@ -33,15 +32,15 @@ class CompletionRequest(CommonCompletionRequest): # Prompt can also contain token ids, but that's out of scope # for this project. - prompt: Union[str, List[str]] + prompt: str | list[str] class CompletionResponse(BaseModel): """Represents a completion response.""" id: str = Field(default_factory=lambda: f"cmpl-{uuid4().hex}") - choices: List[CompletionRespChoice] + choices: list[CompletionRespChoice] created: int = Field(default_factory=lambda: int(time())) model: str object: str = "text_completion" - usage: Optional[UsageStats] = None + usage: UsageStats | None = None diff --git a/endpoints/OAI/types/embedding.py b/endpoints/OAI/types/embedding.py index 41419c4d..7eaefcf9 100644 --- a/endpoints/OAI/types/embedding.py +++ b/endpoints/OAI/types/embedding.py @@ -1,23 +1,21 @@ -from typing import List, Optional, Union - from pydantic import BaseModel, Field class UsageInfo(BaseModel): prompt_tokens: int = 0 total_tokens: int = 0 - completion_tokens: Optional[int] = 0 + completion_tokens: int | None = 0 class EmbeddingsRequest(BaseModel): - input: Union[str, List[str]] = Field( - ..., description="List of input texts to generate embeddings for." + input: str | list[str] = Field( + ..., description="list of input texts to generate embeddings for." ) encoding_format: str = Field( "float", description="Encoding format for the embeddings. Can be 'float' or 'base64'.", ) - model: Optional[str] = Field( + model: str | None = Field( None, description="Name of the embedding model to use. " "If not provided, the default model will be used.", @@ -26,8 +24,8 @@ class EmbeddingsRequest(BaseModel): class EmbeddingObject(BaseModel): object: str = Field("embedding", description="Type of the object.") - embedding: Union[List[float], str] = Field( - ..., description="Embedding values as a list of floats." + embedding: list[float] | str = Field( + ..., description="Embedding values as a List of floats." ) index: int = Field( ..., description="Index of the input text corresponding to the embedding." 
@@ -36,6 +34,6 @@ class EmbeddingObject(BaseModel): class EmbeddingsResponse(BaseModel): object: str = Field("list", description="Type of the response object.") - data: List[EmbeddingObject] = Field(..., description="List of embedding objects.") + data: list[EmbeddingObject] = Field(..., description="List of embedding objects.") model: str = Field(..., description="Name of the embedding model used.") usage: UsageInfo = Field(..., description="Information about token usage.") diff --git a/endpoints/OAI/types/tools.py b/endpoints/OAI/types/tools.py index b5b9611f..138fda9d 100644 --- a/endpoints/OAI/types/tools.py +++ b/endpoints/OAI/types/tools.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, Field -from typing import Dict, Literal +from typing import Literal from uuid import uuid4 @@ -8,7 +8,7 @@ class Function(BaseModel): name: str description: str - parameters: Dict[str, object] + parameters: dict[str, object] class ToolSpec(BaseModel): diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index b559bb2b..819a409c 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -3,7 +3,6 @@ import asyncio import pathlib from asyncio import CancelledError -from typing import List, Optional from fastapi import HTTPException, Request from jinja2 import TemplateError from loguru import logger @@ -33,7 +32,7 @@ def _create_response( - request_id: str, generations: List[dict], model_name: Optional[str] + request_id: str, generations: list[dict], model_name: str | None ): """Create a chat completion response from the provided text.""" @@ -70,7 +69,7 @@ def _create_response( logprob_response = ChatCompletionLogprobs(content=collected_token_probs) - # Set finish reason + # set finish reason if message.tool_calls: finish_reason = "tool_calls" else: @@ -111,8 +110,8 @@ def _create_response( def _create_stream_chunk( request_id: str, - generation: Optional[dict] = None, - model_name: Optional[str] = None, + generation: dict | None = None, + model_name: str | None = None, is_usage_chunk: bool = False, ): """Create a chat completion stream chunk from the provided text.""" @@ -212,8 +211,8 @@ async def _append_template_metadata(data: ChatCompletionRequest, template_vars: async def format_messages_with_template( - messages: List[ChatCompletionMessage], - existing_template_vars: Optional[dict] = None, + messages: list[ChatCompletionMessage], + existing_template_vars: dict | None = None, ): """Barebones function to format chat completion messages into a prompt.""" @@ -221,7 +220,7 @@ async def format_messages_with_template( mm_embeddings = MultimodalEmbeddingWrapper() if model.container.use_vision else None # Convert all messages to a dictionary representation - message_dicts: List[dict] = [] + message_dicts: list[dict] = [] for message in messages: if isinstance(message.content, list): concatenated_content = "" @@ -317,7 +316,7 @@ async def stream_generate_chat_completion( """Generator for the generation process.""" abort_event = asyncio.Event() gen_queue = asyncio.Queue() - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] tool_start = model.container.prompt_template.metadata.tool_start disconnect_task = asyncio.create_task(request_disconnect_loop(request)) @@ -414,7 +413,7 @@ async def generate_chat_completion( request: Request, model_path: pathlib.Path, ): - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] tool_start = model.container.prompt_template.metadata.tool_start try: @@ 
-462,14 +461,14 @@ async def generate_tool_calls( prompt: str, embeddings: MultimodalEmbeddingWrapper, data: ChatCompletionRequest, - generations: List[str], + generations: list[str], request: Request, ): - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] tool_start = model.container.prompt_template.metadata.tool_start # Tracks which generations asked for a tool call - tool_idx: List[int] = [] + tool_idx: list[int] = [] # Copy to make sure the parent JSON schema doesn't get modified tool_data = data.model_copy(deep=True) diff --git a/endpoints/OAI/utils/completion.py b/endpoints/OAI/utils/completion.py index f66d381d..4de10361 100644 --- a/endpoints/OAI/utils/completion.py +++ b/endpoints/OAI/utils/completion.py @@ -9,7 +9,6 @@ from asyncio import CancelledError from fastapi import HTTPException, Request from loguru import logger -from typing import List, Optional, Union from common import model from common.auth import get_key_permission @@ -39,7 +38,7 @@ def _parse_gen_request_id(n: int, request_id: str, task_idx: int): def _create_response( - request_id: str, generations: Union[dict, List[dict]], model_name: str = "" + request_id: str, generations: dict | list[dict], model_name: str = "" ): """Create a completion response from the provided choices.""" @@ -47,7 +46,7 @@ def _create_response( if not isinstance(generations, list): generations = [generations] - choices: List[CompletionRespChoice] = [] + choices: list[CompletionRespChoice] = [] for index, generation in enumerate(generations): logprob_response = None @@ -103,7 +102,7 @@ async def _stream_collector( prompt: str, params: CompletionRequest, abort_event: asyncio.Event, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): """Collects a stream and places results in a common queue""" @@ -200,7 +199,7 @@ async def stream_generate_completion( abort_event = asyncio.Event() gen_queue = asyncio.Queue() - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] disconnect_task = asyncio.create_task(request_disconnect_loop(request)) try: @@ -261,7 +260,7 @@ async def generate_completion( ): """Non-streaming generate for completions""" - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] try: logger.info(f"Received completion request {request.state.id}") diff --git a/endpoints/OAI/utils/tools.py b/endpoints/OAI/utils/tools.py index c1ebdedf..66e466fe 100644 --- a/endpoints/OAI/utils/tools.py +++ b/endpoints/OAI/utils/tools.py @@ -1,6 +1,5 @@ import json from loguru import logger -from typing import List from endpoints.OAI.types.tools import ToolCall @@ -30,7 +29,7 @@ class ToolCallProcessor: @staticmethod - def from_json(tool_calls_str: str) -> List[ToolCall]: + def from_json(tool_calls_str: str) -> list[ToolCall]: """Postprocess tool call JSON to a parseable class""" tool_calls = json.loads(tool_calls_str) @@ -42,15 +41,15 @@ def from_json(tool_calls_str: str) -> List[ToolCall]: return [ToolCall(**tool_call) for tool_call in tool_calls] @staticmethod - def dump(tool_calls: List[ToolCall]) -> List[dict]: + def dump(tool_calls: list[ToolCall]) -> list[dict]: """ Convert ToolCall objects to a list of dictionaries. 
Args: - tool_calls (List[ToolCall]): List of ToolCall objects to convert + tool_calls (list[ToolCall]): list of ToolCall objects to convert Returns: - List[dict]: List of dictionaries representing the tool calls + list[dict]: list of dictionaries representing the tool calls """ # Don't use list comprehension here @@ -64,12 +63,12 @@ def dump(tool_calls: List[ToolCall]) -> List[dict]: return dumped_tool_calls @staticmethod - def to_json(tool_calls: List[ToolCall]) -> str: + def to_json(tool_calls: list[ToolCall]) -> str: """ Convert ToolCall objects to JSON string representation. Args: - tool_calls (List[ToolCall]): List of ToolCall objects to convert + tool_calls (list[ToolCall]): list of ToolCall objects to convert Returns: str: JSON representation of the tool calls diff --git a/endpoints/core/router.py b/endpoints/core/router.py index 800c7d90..3b362a73 100644 --- a/endpoints/core/router.py +++ b/endpoints/core/router.py @@ -1,7 +1,6 @@ import asyncio import pathlib from sys import maxsize -from typing import Optional from common.multimodal import MultimodalEmbeddingWrapper from fastapi import APIRouter, Depends, HTTPException, Request, Response from fastapi.responses import JSONResponse @@ -275,7 +274,7 @@ async def load_lora(data: LoraLoadRequest) -> LoraLoadResponse: if not data.loras: error_message = handle_request_error( - "List of loras to load is not found.", + "list of loras to load is not found.", exc_info=False, ).error.message @@ -403,7 +402,7 @@ async def unload_embedding_model(): async def encode_tokens(data: TokenEncodeRequest) -> TokenEncodeResponse: """Encodes a string or chat completion messages into tokens.""" - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None + mm_embeddings: MultimodalEmbeddingWrapper | None = None if isinstance(data.text, str): text = data.text @@ -547,7 +546,7 @@ async def unload_template(): @router.get("/v1/sampling/override/list", dependencies=[Depends(check_api_key)]) async def list_sampler_overrides(request: Request) -> SamplerOverrideListResponse: """ - List all currently applied sampler overrides. + list all currently applied sampler overrides. Requires an admin key to see all override presets. 
""" diff --git a/endpoints/core/types/download.py b/endpoints/core/types/download.py index cf49501f..c9f8a040 100644 --- a/endpoints/core/types/download.py +++ b/endpoints/core/types/download.py @@ -1,5 +1,4 @@ from pydantic import BaseModel, Field -from typing import List, Optional def _generate_include_list(): @@ -11,13 +10,13 @@ class DownloadRequest(BaseModel): repo_id: str repo_type: str = "model" - folder_name: Optional[str] = None - revision: Optional[str] = None - token: Optional[str] = None - include: List[str] = Field(default_factory=_generate_include_list) - exclude: List[str] = Field(default_factory=list) - chunk_limit: Optional[int] = None - timeout: Optional[int] = None + folder_name: str | None = None + revision: str | None = None + token: str | None = None + include: list[str] = Field(default_factory=_generate_include_list) + exclude: list[str] = Field(default_factory=list) + chunk_limit: int | None = None + timeout: int | None = None class DownloadResponse(BaseModel): diff --git a/endpoints/core/types/health.py b/endpoints/core/types/health.py index ad5fffef..189fa008 100644 --- a/endpoints/core/types/health.py +++ b/endpoints/core/types/health.py @@ -11,5 +11,5 @@ class HealthCheckResponse(BaseModel): "healthy", description="System health status" ) issues: list[UnhealthyEvent] = Field( - default_factory=list, description="List of issues" + default_factory=list, description="list of issues" ) diff --git a/endpoints/core/types/lora.py b/endpoints/core/types/lora.py index 8435a8a4..88e9ba8f 100644 --- a/endpoints/core/types/lora.py +++ b/endpoints/core/types/lora.py @@ -2,7 +2,6 @@ from pydantic import BaseModel, Field from time import time -from typing import Optional, List class LoraCard(BaseModel): @@ -12,32 +11,32 @@ class LoraCard(BaseModel): object: str = "lora" created: int = Field(default_factory=lambda: int(time())) owned_by: str = "tabbyAPI" - scaling: Optional[float] = None + scaling: float | None = None class LoraList(BaseModel): """Represents a list of Lora cards.""" object: str = "list" - data: List[LoraCard] = Field(default_factory=list) + data: list[LoraCard] = Field(default_factory=list) class LoraLoadInfo(BaseModel): """Represents a single Lora load info.""" name: str - scaling: Optional[float] = 1.0 + scaling: float | None = 1.0 class LoraLoadRequest(BaseModel): """Represents a Lora load request.""" - loras: List[LoraLoadInfo] + loras: list[LoraLoadInfo] skip_queue: bool = False class LoraLoadResponse(BaseModel): """Represents a Lora load response.""" - success: List[str] = Field(default_factory=list) - failure: List[str] = Field(default_factory=list) + success: list[str] = Field(default_factory=list) + failure: list[str] = Field(default_factory=list) diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index 84229294..0f4252fb 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -1,8 +1,10 @@ """Contains model card types.""" +from __future__ import annotations + from pydantic import BaseModel, Field, ConfigDict from time import time -from typing import List, Literal, Optional, Union +from typing import Literal from common.config_models import LoggingConfig from common.tabby_config import config @@ -13,19 +15,19 @@ class ModelCardParameters(BaseModel): # Safe to do this since it's guaranteed to fetch a max seq len # from model_container - max_seq_len: Optional[int] = None - cache_size: Optional[int] = None - cache_mode: Optional[str] = "FP16" - rope_scale: Optional[float] = 1.0 - rope_alpha: 
Optional[float] = 1.0 - max_batch_size: Optional[int] = 1 - chunk_size: Optional[int] = 2048 - prompt_template: Optional[str] = None - prompt_template_content: Optional[str] = None - use_vision: Optional[bool] = False + max_seq_len: int | None = None + cache_size: int | None = None + cache_mode: str | None = "FP16" + rope_scale: float | None = 1.0 + rope_alpha: float | None = 1.0 + max_batch_size: int | None = 1 + chunk_size: int | None = 2048 + prompt_template: str | None = None + prompt_template_content: str | None = None + use_vision: bool | None = False # Draft is another model, so include it in the card params - draft: Optional["ModelCard"] = None + draft: ModelCard | None = None class ModelCard(BaseModel): @@ -35,15 +37,15 @@ class ModelCard(BaseModel): object: str = "model" created: int = Field(default_factory=lambda: int(time())) owned_by: str = "tabbyAPI" - logging: Optional[LoggingConfig] = None - parameters: Optional[ModelCardParameters] = None + logging: LoggingConfig | None = None + parameters: ModelCardParameters | None = None class ModelList(BaseModel): """Represents a list of model cards.""" object: str = "list" - data: List[ModelCard] = Field(default_factory=list) + data: list[ModelCard] = Field(default_factory=list) class DraftModelLoadRequest(BaseModel): @@ -53,13 +55,13 @@ class DraftModelLoadRequest(BaseModel): draft_model_name: str # Config arguments - draft_rope_scale: Optional[float] = None - draft_rope_alpha: Optional[Union[float, Literal["auto"]]] = Field( + draft_rope_scale: float | None = None + draft_rope_alpha: float | Literal["auto"] | None = Field( description='Automatically calculated if set to "auto"', default=None, examples=[1.0], ) - draft_gpu_split: Optional[List[float]] = Field( + draft_gpu_split: list[float] | None = Field( default_factory=list, examples=[[24.0, 20.0]], ) @@ -75,54 +77,54 @@ class ModelLoadRequest(BaseModel): model_name: str # Config arguments - backend: Optional[str] = Field( + backend: str | None = Field( description="Backend to use", default=None, ) - max_seq_len: Optional[int] = Field( + max_seq_len: int | None = Field( description="Leave this blank to use the model's base sequence length", default=None, examples=[4096], ) - cache_size: Optional[int] = Field( + cache_size: int | None = Field( description="Number in tokens, must be multiple of 256", default=None, examples=[4096], ) - cache_mode: Optional[str] = None - tensor_parallel: Optional[bool] = None - tensor_parallel_backend: Optional[str] = "native" - gpu_split_auto: Optional[bool] = None - autosplit_reserve: Optional[List[float]] = None - gpu_split: Optional[List[float]] = Field( + cache_mode: str | None = None + tensor_parallel: bool | None = None + tensor_parallel_backend: str | None = "native" + gpu_split_auto: bool | None = None + autosplit_reserve: list[float] | None = None + gpu_split: list[float] | None = Field( default_factory=list, examples=[[24.0, 20.0]], ) - rope_scale: Optional[float] = Field( + rope_scale: float | None = Field( description="Automatically pulled from the model's config if not present", default=None, examples=[1.0], ) - rope_alpha: Optional[Union[float, Literal["auto"]]] = Field( + rope_alpha: float | Literal["auto"] | None = Field( description='Automatically calculated if set to "auto"', default=None, examples=[1.0], ) - chunk_size: Optional[int] = None - output_chunking: Optional[bool] = True - prompt_template: Optional[str] = None - vision: Optional[bool] = None + chunk_size: int | None = None + output_chunking: bool | None = True + 
prompt_template: str | None = None + vision: bool | None = None # Non-config arguments - draft_model: Optional[DraftModelLoadRequest] = None - skip_queue: Optional[bool] = False + draft_model: DraftModelLoadRequest | None = None + skip_queue: bool | None = False class EmbeddingModelLoadRequest(BaseModel): embedding_model_name: str - # Set default from the config - embeddings_device: Optional[str] = Field(config.embeddings.embeddings_device) + # set default from the config + embeddings_device: str | None = Field(config.embeddings.embeddings_device) class ModelLoadResponse(BaseModel): diff --git a/endpoints/core/types/sampler_overrides.py b/endpoints/core/types/sampler_overrides.py index 18627829..2a2efab0 100644 --- a/endpoints/core/types/sampler_overrides.py +++ b/endpoints/core/types/sampler_overrides.py @@ -1,5 +1,4 @@ from pydantic import BaseModel, Field -from typing import List, Optional from common.sampling import SamplerOverridesContainer @@ -7,17 +6,17 @@ class SamplerOverrideListResponse(SamplerOverridesContainer): """Sampler override list response""" - presets: Optional[List[str]] + presets: list[str] | None class SamplerOverrideSwitchRequest(BaseModel): """Sampler override switch request""" - preset: Optional[str] = Field( + preset: str | None = Field( default=None, description="Pass a sampler override preset name" ) - overrides: Optional[dict] = Field( + overrides: dict | None = Field( default=None, description=( "Sampling override parent takes in individual keys and overrides. " diff --git a/endpoints/core/types/template.py b/endpoints/core/types/template.py index a82ef48d..a3b98fb2 100644 --- a/endpoints/core/types/template.py +++ b/endpoints/core/types/template.py @@ -1,12 +1,11 @@ from pydantic import BaseModel, Field -from typing import List class TemplateList(BaseModel): """Represents a list of templates.""" object: str = "list" - data: List[str] = Field(default_factory=list) + data: list[str] = Field(default_factory=list) class TemplateSwitchRequest(BaseModel): diff --git a/endpoints/core/types/token.py b/endpoints/core/types/token.py index d43e65e4..8a28d880 100644 --- a/endpoints/core/types/token.py +++ b/endpoints/core/types/token.py @@ -1,7 +1,6 @@ """Tokenization types""" from pydantic import BaseModel -from typing import List, Union from endpoints.OAI.types.chat_completion import ChatCompletionMessage @@ -25,20 +24,20 @@ def get_params(self): class TokenEncodeRequest(CommonTokenRequest): """Represents a tokenization request.""" - text: Union[str, List[ChatCompletionMessage]] + text: str | list[ChatCompletionMessage] class TokenEncodeResponse(BaseModel): """Represents a tokenization response.""" - tokens: List[int] + tokens: list[int] length: int class TokenDecodeRequest(CommonTokenRequest): """ " Represents a detokenization request.""" - tokens: List[int] + tokens: list[int] class TokenDecodeResponse(BaseModel): diff --git a/endpoints/core/utils/model.py b/endpoints/core/utils/model.py index 20c9433c..fa11186e 100644 --- a/endpoints/core/utils/model.py +++ b/endpoints/core/utils/model.py @@ -1,6 +1,5 @@ import pathlib from asyncio import CancelledError -from typing import Optional from common import model from common.networking import get_generator_error, handle_request_disconnect @@ -13,7 +12,7 @@ ) -def get_model_list(model_path: pathlib.Path, draft_model_path: Optional[str] = None): +def get_model_list(model_path: pathlib.Path, draft_model_path: str | None = None): """Get the list of models from the provided path.""" # Convert the provided draft model path to a 
pathlib path for @@ -83,7 +82,7 @@ async def stream_model_load( # Get trimmed load data load_data = data.model_dump(exclude_none=True) - # Set the draft model directory + # set the draft model directory load_data.setdefault("draft_model", {})["draft_model_dir"] = ( config.draft_model.draft_model_dir ) diff --git a/endpoints/server.py b/endpoints/server.py index c17cbb44..4287f747 100644 --- a/endpoints/server.py +++ b/endpoints/server.py @@ -3,7 +3,6 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from loguru import logger -from typing import Optional from common.logger import UVICORN_LOG_CONFIG from common.networking import get_global_depends @@ -13,7 +12,7 @@ from endpoints.core.router import router as CoreRouter -def setup_app(host: Optional[str] = None, port: Optional[int] = None): +def setup_app(host: str | None = None, port: int | None = None): """Includes the correct routers for startup""" app = FastAPI( diff --git a/main.py b/main.py index 661613eb..1409c656 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ """The main tabbyAPI module. Contains the FastAPI server and endpoints.""" -# Set this env var for cuda malloc async before torch is initalized +# set this env var for cuda malloc async before torch is initalized import os os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync" @@ -11,7 +11,6 @@ import platform import signal from loguru import logger -from typing import Optional from common import gen_logging, sampling, model from common.args import convert_args_to_dict, init_argparser @@ -92,7 +91,7 @@ async def entrypoint_async(): gen_logging.broadcast_status() - # Set sampler parameter overrides if provided + # set sampler parameter overrides if provided sampling_override_preset = config.sampling.override_preset if sampling_override_preset: try: @@ -104,12 +103,12 @@ async def entrypoint_async(): def entrypoint( - args: Optional[argparse.Namespace] = None, - parser: Optional[argparse.ArgumentParser] = None, + args: argparse.Namespace | None = None, + parser: argparse.ArgumentParser | None = None, ): setup_logger() - # Set up signal aborting + # set up signal aborting signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) @@ -118,7 +117,7 @@ def entrypoint( else: from uvloop import install - # Set loop event policy + # set loop event policy install() # Parse and override config from args @@ -158,7 +157,7 @@ def entrypoint( raise SystemExit(install_message) - # Set the process priority + # set the process priority if config.developer.realtime_process_priority: import psutil diff --git a/start.py b/start.py index 95bdd366..20c4d489 100644 --- a/start.py +++ b/start.py @@ -9,7 +9,6 @@ import sys import traceback from shutil import copyfile, which -from typing import List # Checks for uv installation has_uv = which("uv") is not None @@ -154,7 +153,7 @@ def migrate_start_options(start_options: dict): return migrated -def run_pip(command: List[str]): +def run_pip(command: list[str]): if has_uv: command.insert(0, "uv") @@ -204,10 +203,10 @@ def run_pip(command: List[str]): "Getting things ready..." 
) - # Set variables that rely on start options + # set variables that rely on start options first_run = not start_options.get("first_run_done") - # Set gpu_lib for dependency install + # set gpu_lib for dependency install if args.gpu_lib: print("Overriding GPU lib name from args.") gpu_lib = args.gpu_lib From 22cb816b5ea5a08f00899ea24d535ec44410b1b5 Mon Sep 17 00:00:00 2001 From: AlpinDale Date: Fri, 31 Oct 2025 20:58:42 +0430 Subject: [PATCH 2/3] fix --- backends/exllamav2/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 903b4929..74024f63 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -271,6 +271,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs self.config.max_seq_len = unwrap( user_max_seq_len, min(hf_model.hf_config.max_position_embeddings, 4096) ) + self.cache_size = self.config.max_seq_len # set the rope scale self.config.scale_pos_emb = unwrap( From 4b8c210a8823951345e4be0c11bda925932a291a Mon Sep 17 00:00:00 2001 From: AlpinDale <52078762+AlpinDale@users.noreply.github.com> Date: Fri, 31 Oct 2025 21:24:27 +0430 Subject: [PATCH 3/3] Apply suggestions from code review --- backends/exllamav2/model.py | 34 +++++++++++++------------- backends/exllamav3/model.py | 12 ++++----- colab/TabbyAPI_Colab_Example.ipynb | 6 ++--- common/auth.py | 2 +- common/config_models.py | 12 ++++----- common/tabby_config.py | 2 +- config_sample.yml | 14 +++++------ docker/Dockerfile | 4 +-- docs/01.-Getting-Started.md | 2 +- docs/02.-Server-options.md | 10 ++++---- docs/06.-Sharing.md | 2 +- endpoints/OAI/router.py | 4 +-- endpoints/OAI/types/embedding.py | 4 +-- endpoints/OAI/utils/chat_completion.py | 2 +- endpoints/core/router.py | 4 +-- endpoints/core/types/model.py | 2 +- endpoints/core/utils/model.py | 2 +- main.py | 10 ++++---- start.py | 4 +-- 19 files changed, 66 insertions(+), 66 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 74024f63..5c5c934c 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -129,7 +129,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Check if the model arch is compatible with various exl2 features self.config.arch_compat_overrides() - # set vision state and error if vision isn't supported on the current model + # Set vision state and error if vision isn't supported on the current model self.use_vision = unwrap(kwargs.get("vision"), False) if self.use_vision and not self.config.vision_model_type: raise ValueError( @@ -184,12 +184,12 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs gpu_split = unwrap(kwargs.get("gpu_split"), []) gpu_device_list = list(range(0, gpu_count)) - # set GPU split options + # Set GPU split options if gpu_count == 1: self.gpu_split_auto = False logger.info("Disabling GPU split because one GPU is in use.") else: - # set tensor parallel + # Set tensor parallel if use_tp: self.use_tp = True @@ -232,7 +232,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Hardcode max output length to 16 self.config.max_output_len = 16 - # set max batch size to the config override + # Set max batch size to the config override self.max_batch_size = unwrap(kwargs.get("max_batch_size")) # Check whether the user's configuration supports flash/paged attention @@ -261,7 +261,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # 
Grab user-set max seq len user_max_seq_len = kwargs.get("max_seq_len") - # set k/v cache size + # Set k/v cache size # cache_size is only relevant when paged mode is enabled if self.paged: user_cache_size = coalesce(kwargs.get("cache_size"), user_max_seq_len, 4096) @@ -273,7 +273,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs ) self.cache_size = self.config.max_seq_len - # set the rope scale + # Set the rope scale self.config.scale_pos_emb = unwrap( kwargs.get("rope_scale"), self.config.scale_pos_emb ) @@ -322,7 +322,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs self.config.max_input_len = chunk_size self.config.max_attention_size = chunk_size**2 - # set user-configured draft model values + # Set user-configured draft model values if self.use_draft_model: self.draft_config.max_seq_len = self.config.max_seq_len @@ -330,7 +330,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs draft_args.get("draft_rope_scale"), 1.0 ) - # set draft rope alpha. Follows same behavior as model rope alpha. + # Set draft rope alpha. Follows same behavior as model rope alpha. # Use the max_position_embeddings of the model draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto") if draft_rope_alpha == "auto": @@ -341,7 +341,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs else: self.draft_config.scale_alpha_value = draft_rope_alpha - # set draft cache mode + # Set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") # Catch exllamav3 draft_cache_mode @@ -836,7 +836,7 @@ async def unload(self, loras_only: bool = False, **kwargs): await self.generator.close() self.generator = None - # set all model state variables to False + # Set all model state variables to False self.loaded = False gc.collect() @@ -1136,15 +1136,15 @@ def assign_gen_params( "top_k, top_p, and typical to 1.0, 1, 0, and 0." ) - # set banned tokens + # Set banned tokens if params.banned_tokens: gen_settings.disallow_tokens(self.tokenizer, params.banned_tokens) - # set allowed tokens + # Set allowed tokens if params.allowed_tokens: gen_settings.allow_tokens(self.tokenizer, params.allowed_tokens) - # set logit bias + # Set logit bias if params.logit_bias: # Create a vocab tensor if it doesn't exist for token biasing if gen_settings.token_bias is None: @@ -1261,7 +1261,7 @@ async def generate_gen( grammar_handler, ) - # set banned strings + # Set banned strings banned_strings = params.banned_strings if banned_strings and len(grammar_handler.filters) > 0: logger.warning( @@ -1271,7 +1271,7 @@ async def generate_gen( banned_strings = [] - # set CFG scale and negative prompt + # Set CFG scale and negative prompt cfg_scale = params.cfg_scale negative_prompt = None if cfg_scale not in [None, 1.0]: @@ -1301,7 +1301,7 @@ async def generate_gen( stop_conditions = params.stop ban_eos_token = params.ban_eos_token - # set add_bos_token for generation + # Set add_bos_token for generation add_bos_token = unwrap(params.add_bos_token, self.hf_model.add_bos_token()) # Fetch EOS tokens from the HF model if they exist @@ -1309,7 +1309,7 @@ async def generate_gen( # Ban the EOS token if specified. If not, append to stop conditions # as well. 
- # set this below logging to avoid polluting the stop strings array + # Set this below logging to avoid polluting the stop strings array if ban_eos_token: gen_settings.disallow_tokens(self.tokenizer, eos_tokens) else: diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 4b369c7f..389f581b 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -157,12 +157,12 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs gpu_device_list = list(range(0, gpu_count)) use_tp = unwrap(kwargs.get("tensor_parallel"), False) - # set GPU split options + # Set GPU split options if gpu_count == 1: self.gpu_split_auto = False logger.info("Disabling GPU split because one GPU is in use.") else: - # set tensor parallel + # Set tensor parallel if use_tp: self.use_tp = True tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native") @@ -179,7 +179,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # TP has its own autosplit loader self.gpu_split_auto = False - # set GPU split options + # Set GPU split options # Enable manual GPU split if provided if gpu_split: self.gpu_split_auto = False @@ -235,7 +235,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Draft cache if self.use_draft_model: - # set draft cache mode + # Set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") self.draft_cache = self.create_cache( self.draft_cache_mode, self.draft_model @@ -870,7 +870,7 @@ async def generate_gen( # Penalties - # set penalty range + # Set penalty range penalty_range = unwrap(params.penalty_range, self.max_seq_len) # Exl3's version of including the entire context @@ -911,7 +911,7 @@ async def generate_gen( sampler_builder.temperature(params.temperature) # Build the sampler - # set greedy if temperature is 0 + # Set greedy if temperature is 0 sampler = sampler_builder.build(params.temperature == 0) # Dynamically scale penalty range to output tokens diff --git a/colab/TabbyAPI_Colab_Example.ipynb b/colab/TabbyAPI_Colab_Example.ipynb index c23bd345..b8e32f0a 100644 --- a/colab/TabbyAPI_Colab_Example.ipynb +++ b/colab/TabbyAPI_Colab_Example.ipynb @@ -184,13 +184,13 @@ " # Only use if your model was trained on long context with rope (check config.json)\n", " rope_alpha: {RopeAlpha}\n", "\n", - " # Disable Flash-attention 2. set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n", + " # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n", " no_flash_attention: {NoFlashAttention}\n", "\n", " # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)\n", " cache_mode: {CacheMode}\n", "\n", - " # set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n", + " # Set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n", " # NOTE: Only works with chat completion message lists!\n", " prompt_template: {PromptTemplate}\n", "\n", @@ -218,7 +218,7 @@ " # Overrides the directory to look for loras (default: loras)\n", " lora_dir: loras\n", "\n", - " # list of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.\n", + " # List of loras to load and associated scaling factors (default: 1.0). 
Comment out unused entries or add more rows as needed.\n", " loras:\n", " - name: {lora}\n", " scaling: {LoraScaling}\n", diff --git a/common/auth.py b/common/auth.py index c986fc40..bd93afe0 100644 --- a/common/auth.py +++ b/common/auth.py @@ -51,7 +51,7 @@ async def load_auth_keys(disable_from_config: bool): if disable_from_config: logger.warning( "Disabling authentication makes your instance vulnerable. " - "set the `disable_auth` flag to False in config.yml if you " + "Set the `disable_auth` flag to False in config.yml if you " "want to share this instance with others." ) diff --git a/common/config_models.py b/common/config_models.py index 781acdb3..c5772c42 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -250,7 +250,7 @@ class ModelConfig(BaseConfigModel): None, description=( "Rope alpha (default: None).\n" - 'Same as alpha_value. set to "auto" to auto-calculate.\n' + 'Same as alpha_value. Set to "auto" to auto-calculate.\n' "Leaving this value blank will either pull from the model " "or auto-calculate." ), @@ -277,7 +277,7 @@ class ModelConfig(BaseConfigModel): max_batch_size: int | None = Field( None, description=( - "set the maximum number of prompts to process at one time " + "Set the maximum number of prompts to process at one time " "(default: None/Automatic).\n" "Automatically calculated if left blank.\n" "NOTE: Only available for Nvidia ampere (30 series) and above GPUs." @@ -287,7 +287,7 @@ class ModelConfig(BaseConfigModel): prompt_template: str | None = Field( None, description=( - "set the prompt template for this model. (default: None)\n" + "Set the prompt template for this model. (default: None)\n" "If empty, attempts to look for the model's chat template.\n" "If a model contains multiple templates in its tokenizer_config.json,\n" "set prompt_template to the name of the template you want to use.\n" @@ -335,7 +335,7 @@ class DraftModelConfig(BaseConfigModel): None, description=( "Rope alpha for draft models (default: None).\n" - 'Same as alpha_value. set to "auto" to auto-calculate.\n' + 'Same as alpha_value. Set to "auto" to auto-calculate.\n' "Leaving this value blank will either pull from the model " "or auto-calculate." ), @@ -390,7 +390,7 @@ class LoraConfig(BaseConfigModel): loras: list[LoraInstanceModel] | None = Field( None, description=( - "list of LoRAs to load and associated scaling factors " + "List of LoRAs to load and associated scaling factors " "(default scale: 1.0).\n" "For the YAML file, add each entry as a YAML list:\n" "- name: lora1\n" @@ -443,7 +443,7 @@ class DeveloperConfig(BaseConfigModel): realtime_process_priority: bool | None = Field( False, description=( - "set process to use a higher priority.\n" + "Set process to use a higher priority.\n" "For realtime process priority, run as administrator or sudo.\n" "Otherwise, the priority will be set to high." 
), diff --git a/common/tabby_config.py b/common/tabby_config.py index 212746f1..535cb6b2 100644 --- a/common/tabby_config.py +++ b/common/tabby_config.py @@ -44,7 +44,7 @@ def load(self, arguments: dict | None = None): value = getattr(merged_config_model, field) setattr(self, field, value) - # set model defaults dict once to prevent on-demand reconstruction + # Set model defaults dict once to prevent on-demand reconstruction # TODO: clean this up a bit for field in self.model.use_as_default: if hasattr(self.model, field): diff --git a/config_sample.yml b/config_sample.yml index 88fa308a..0b65f9e8 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -79,7 +79,7 @@ model: backend: # Max sequence length (default: min(max_position_embeddings, cache_size)). - # set to -1 to fetch from the model's config.json + # Set to -1 to fetch from the model's config.json max_seq_len: # Size of the key/value cache to allocate, in tokens (default: 4096). @@ -126,7 +126,7 @@ model: rope_scale: 1.0 # Rope alpha (default: None). - # Same as alpha_value. set to "auto" to auto-calculate. + # Same as alpha_value. Set to "auto" to auto-calculate. # Leaving this value blank will either pull from the model or auto-calculate. rope_alpha: @@ -141,12 +141,12 @@ model: # Used by EXL3 models only. output_chunking: true - # set the maximum number of prompts to process at one time (default: None/Automatic). + # Set the maximum number of prompts to process at one time (default: None/Automatic). # Automatically calculated if left blank. # NOTE: Only available for Nvidia ampere (30 series) and above GPUs. max_batch_size: - # set the prompt template for this model. (default: None) + # Set the prompt template for this model. (default: None) # If empty, attempts to look for the model's chat template. # If a model contains multiple templates in its tokenizer_config.json, # set prompt_template to the name of the template you want to use. @@ -172,7 +172,7 @@ draft_model: draft_rope_scale: 1.0 # Rope alpha for draft models (default: None). - # Same as alpha_value. set to "auto" to auto-calculate. + # Same as alpha_value. Set to "auto" to auto-calculate. # Leaving this value blank will either pull from the model or auto-calculate. draft_rope_alpha: @@ -199,7 +199,7 @@ lora: # Directory to look for LoRAs (default: loras). lora_dir: loras - # list of LoRAs to load and associated scaling factors (default scale: 1.0). + # List of LoRAs to load and associated scaling factors (default scale: 1.0). # For the YAML file, add each entry as a YAML list: # - name: lora1 # scaling: 1.0 @@ -230,7 +230,7 @@ developer: # Disable API request streaming (default: False). disable_request_streaming: false - # set process to use a higher priority. + # Set process to use a higher priority. # For realtime process priority, run as administrator or sudo. # Otherwise, the priority will be set to high. realtime_process_priority: false diff --git a/docker/Dockerfile b/docker/Dockerfile index ba038cec..58aa61f5 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,7 +20,7 @@ ENV PATH="/opt/venv/bin:$PATH" # Upgrade pip RUN pip install --no-cache-dir --upgrade pip -# set the working directory in the container +# Set the working directory in the container WORKDIR /app # Get requirements @@ -37,7 +37,7 @@ COPY . . 
# Make port 5000 available to the world outside this container EXPOSE 5000 -# set the entry point +# Set the entry point ENTRYPOINT ["python3"] # Run main.py when the container launches diff --git a/docs/01.-Getting-Started.md b/docs/01.-Getting-Started.md index 7e74abd9..330116c6 100644 --- a/docs/01.-Getting-Started.md +++ b/docs/01.-Getting-Started.md @@ -171,7 +171,7 @@ These are short-form instructions for other methods that users can use to instal 2. For Windows: [Cuda Toolkit on WSL](https://docs.nvidia.com/cuda/wsl-user-guide/index.html) 3. Clone TabbyAPI via `git clone https://github.com/theroyallab/tabbyAPI` 4. Enter the tabbyAPI directory by `cd tabbyAPI`. - 1. Optional: set up a config.yml or api_tokens.yml ([configuration](#configuration)) + 1. Optional: Set up a config.yml or api_tokens.yml ([configuration](#configuration)) 5. Update the volume mount section in the `docker/docker-compose.yml` file ```yml volumes: diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md index 03f866ad..00546e1b 100644 --- a/docs/02.-Server-options.md +++ b/docs/02.-Server-options.md @@ -16,8 +16,8 @@ All of these options have descriptive comments above them. You should not need t | Config Option | Type (Default) | Description | | ---------------------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| host | String (127.0.0.1) | set the IP address used for hosting TabbyAPI | -| port | Int (5000) | set the TCP Port use for TabbyAPI | +| host | String (127.0.0.1) | Set the IP address used for hosting TabbyAPI | +| port | Int (5000) | Set the TCP Port use for TabbyAPI | | disable_auth | Bool (False) | Disables API authentication | | disable_fetch_requests | Bool (False) | Disables fetching external content when responding to requests (ex. fetching images from URLs) | | send_tracebacks | Bool (False) | Send server tracebacks to client.
Note: It's not recommended to enable this if sharing the instance with others. | @@ -47,7 +47,7 @@ Note: These are experimental flags that may be removed at any point. | ------------------------- | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | unsafe_launch | Bool (False) | Skips dependency checks on startup. Only recommended for debugging. | | disable_request_streaming | Bool (False) | Forcefully disables streaming requests | -| realtime_process_priority | Bool (False) | set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. | +| realtime_process_priority | Bool (False) | Set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. | ### Model Options @@ -58,7 +58,7 @@ Note: Most of the options here will only apply on initial model load/startup (ep | model_dir | String ("models") | Directory to look for models.
Note: Persisted across subsequent load requests | | inline_model_loading | Bool (False) | Enables ability to switch models using the `model` argument in a generation request. More info in [Usage](https://github.com/theroyallab/tabbyAPI/wiki/03.-Usage#inline-loading) | | use_dummy_models | Bool (False) | Send a dummy OAI model card when calling the `/v1/models` endpoint. Used for clients which enforce specific OAI models.
Note: Persisted across subsequent load requests | -| dummy_model_names | list[String] (["gpt-3.5-turbo"]) | list of dummy names to send on model endpoint requests | +| dummy_model_names | list[String] (["gpt-3.5-turbo"]) | List of dummy names to send on model endpoint requests | | model_name | String (None) | Folder name of a model to load. The below parameters will not apply unless this is filled out. | | use_as_default | list[String] ([]) | Keys to use by default when loading models. For example, putting `cache_mode` in this array will make every model load with that value unless specified by the API request.
Note: Also applies to the `draft` sub-block | | max_seq_len | Float (None) | Maximum sequence length of the model. Uses the value from config.json if not specified here. Also called the max context length. | @@ -95,7 +95,7 @@ Note: Sub-block of Mode Options. Same rules apply. | Config Option | Type (Default) | Description | |---------------|------------------|--------------------------------------------------------------| | lora_dir | String ("loras") | Directory to look for loras.
Note: Persisted across subsequent load requests | -| loras | list[loras] ([]) | list of lora objects to apply to the model. Each object contains a name and scaling. | +| loras | list[loras] ([]) | List of lora objects to apply to the model. Each object contains a name and scaling. | | name | String (None) | Folder name of a lora to load.
Note: An element of the `loras` key | | scaling | Float (1.0) | "Weight" to apply the lora on the parent model. For example, applying a lora with 0.9 scaling will lower the amount of application on the parent model.
Note: An element of the `loras` key | diff --git a/docs/06.-Sharing.md b/docs/06.-Sharing.md index a9f69559..86288d80 100644 --- a/docs/06.-Sharing.md +++ b/docs/06.-Sharing.md @@ -28,7 +28,7 @@ Tailscale is a product that uses the WireGuard protocol to provide a mesh networ > This is not a method for exposing your TabbyAPI instance to the world. If you want that, use the other two services. To get started: -1. set your TabbyAPI ip to `0.0.0.0` otherwise you will not be able to access your instance outside your local machine. +1. Set your TabbyAPI ip to `0.0.0.0` otherwise you will not be able to access your instance outside your local machine. 2. Sign up and get started on [Tailscale's website](https://tailscale.com/), then install the client. 3. Connect to your tailscale account on both your host and client machine. 4. Select the Tailscale icon (usually in the system tray) and get the IP of your host device. This is usually identified by the hostname. diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py index bb64a48a..8f4e7a4e 100644 --- a/endpoints/OAI/router.py +++ b/endpoints/OAI/router.py @@ -72,7 +72,7 @@ async def completion_request( disable_request_streaming = config.developer.disable_request_streaming - # set an empty JSON schema if the request wants a JSON response + # Set an empty JSON schema if the request wants a JSON response if data.response_format.type == "json": data.json_schema = {"type": "object"} @@ -125,7 +125,7 @@ async def chat_completion_request( prompt, embeddings = await apply_chat_template(data) - # set an empty JSON schema if the request wants a JSON response + # Set an empty JSON schema if the request wants a JSON response if data.response_format.type == "json": data.json_schema = {"type": "object"} diff --git a/endpoints/OAI/types/embedding.py b/endpoints/OAI/types/embedding.py index 7eaefcf9..abb68fd7 100644 --- a/endpoints/OAI/types/embedding.py +++ b/endpoints/OAI/types/embedding.py @@ -9,7 +9,7 @@ class UsageInfo(BaseModel): class EmbeddingsRequest(BaseModel): input: str | list[str] = Field( - ..., description="list of input texts to generate embeddings for." + ..., description="List of input texts to generate embeddings for." ) encoding_format: str = Field( "float", @@ -25,7 +25,7 @@ class EmbeddingsRequest(BaseModel): class EmbeddingObject(BaseModel): object: str = Field("embedding", description="Type of the object.") embedding: list[float] | str = Field( - ..., description="Embedding values as a List of floats." + ..., description="Embedding values as a list of floats." ) index: int = Field( ..., description="Index of the input text corresponding to the embedding." 
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index 819a409c..157c96ba 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -69,7 +69,7 @@ def _create_response( logprob_response = ChatCompletionLogprobs(content=collected_token_probs) - # set finish reason + # Set finish reason if message.tool_calls: finish_reason = "tool_calls" else: diff --git a/endpoints/core/router.py b/endpoints/core/router.py index 3b362a73..d69d683f 100644 --- a/endpoints/core/router.py +++ b/endpoints/core/router.py @@ -274,7 +274,7 @@ async def load_lora(data: LoraLoadRequest) -> LoraLoadResponse: if not data.loras: error_message = handle_request_error( - "list of loras to load is not found.", + "List of loras to load is not found.", exc_info=False, ).error.message @@ -546,7 +546,7 @@ async def unload_template(): @router.get("/v1/sampling/override/list", dependencies=[Depends(check_api_key)]) async def list_sampler_overrides(request: Request) -> SamplerOverrideListResponse: """ - list all currently applied sampler overrides. + List all currently applied sampler overrides. Requires an admin key to see all override presets. """ diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index 0f4252fb..ad0d8a85 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -123,7 +123,7 @@ class ModelLoadRequest(BaseModel): class EmbeddingModelLoadRequest(BaseModel): embedding_model_name: str - # set default from the config + # Set default from the config embeddings_device: str | None = Field(config.embeddings.embeddings_device) diff --git a/endpoints/core/utils/model.py b/endpoints/core/utils/model.py index fa11186e..f2b39850 100644 --- a/endpoints/core/utils/model.py +++ b/endpoints/core/utils/model.py @@ -82,7 +82,7 @@ async def stream_model_load( # Get trimmed load data load_data = data.model_dump(exclude_none=True) - # set the draft model directory + # Set the draft model directory load_data.setdefault("draft_model", {})["draft_model_dir"] = ( config.draft_model.draft_model_dir ) diff --git a/main.py b/main.py index 1409c656..3f497901 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ """The main tabbyAPI module. Contains the FastAPI server and endpoints.""" -# set this env var for cuda malloc async before torch is initalized +# Set this env var for cuda malloc async before torch is initalized import os os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync" @@ -91,7 +91,7 @@ async def entrypoint_async(): gen_logging.broadcast_status() - # set sampler parameter overrides if provided + # Set sampler parameter overrides if provided sampling_override_preset = config.sampling.override_preset if sampling_override_preset: try: @@ -108,7 +108,7 @@ def entrypoint( ): setup_logger() - # set up signal aborting + # Set up signal aborting signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) @@ -117,7 +117,7 @@ def entrypoint( else: from uvloop import install - # set loop event policy + # Set loop event policy install() # Parse and override config from args @@ -157,7 +157,7 @@ def entrypoint( raise SystemExit(install_message) - # set the process priority + # Set the process priority if config.developer.realtime_process_priority: import psutil diff --git a/start.py b/start.py index 20c4d489..4811c9ac 100644 --- a/start.py +++ b/start.py @@ -203,10 +203,10 @@ def run_pip(command: list[str]): "Getting things ready..." 
) - # set variables that rely on start options + # Set variables that rely on start options first_run = not start_options.get("first_run_done") - # set gpu_lib for dependency install + # Set gpu_lib for dependency install if args.gpu_lib: print("Overriding GPU lib name from args.") gpu_lib = args.gpu_lib
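
For reference, a minimal sketch of the annotation style this series converges on: PEP 585 builtin generics plus PEP 604 unions (Python 3.10+), the same style already visible in context lines above such as `max_batch_size: int | None` and `def run_pip(command: list[str])`. The helper below and its data are hypothetical, written purely for illustration; it is not taken from the diff.

    # Before: the typing aliases this series removes
    #     from typing import Dict, List, Optional
    #     def merge(overrides: Optional[Dict[str, List[int]]]) -> List[int]: ...

    # After: builtin generics and "|" unions, no typing imports required
    def merge(overrides: dict[str, list[int]] | None = None) -> list[int]:
        """Flatten override values into a single list (hypothetical helper)."""
        merged: list[int] = []
        for values in (overrides or {}).values():
            merged.extend(values)
        return merged

    print(merge({"a": [1, 2], "b": [3]}))  # prints [1, 2, 3]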