From 5cc50bd88697290d99edc075d856bbba97d43cfc Mon Sep 17 00:00:00 2001 From: AlpinDale Date: Fri, 31 Oct 2025 20:49:26 +0430 Subject: [PATCH 1/3] [refactor] chore: migrate from `typing.{List,Union,Optional,Dict,Set}` to builtin syntax --- backends/base_model_container.py | 34 +++--- backends/exllamav2/grammar.py | 7 +- backends/exllamav2/model.py | 87 +++++++------- backends/exllamav3/model.py | 65 +++++------ backends/exllamav3/sampler.py | 3 +- backends/infinity/model.py | 7 +- colab/TabbyAPI_Colab_Example.ipynb | 6 +- common/args.py | 3 +- common/auth.py | 5 +- common/config_models.py | 132 +++++++++++----------- common/downloader.py | 23 ++-- common/gen_logging.py | 5 +- common/health.py | 3 +- common/model.py | 7 +- common/multimodal.py | 3 +- common/networking.py | 3 +- common/sampling.py | 93 ++++++++------- common/tabby_config.py | 5 +- common/templating.py | 9 +- common/transformers_utils.py | 17 ++- common/utils.py | 18 +-- config_sample.yml | 14 +-- docker/Dockerfile | 4 +- docs/01.-Getting-Started.md | 2 +- docs/02.-Server-options.md | 20 ++-- docs/06.-Sharing.md | 2 +- endpoints/Kobold/types/generation.py | 9 +- endpoints/Kobold/types/token.py | 3 +- endpoints/OAI/router.py | 4 +- endpoints/OAI/types/chat_completion.py | 56 ++++----- endpoints/OAI/types/common.py | 31 +++-- endpoints/OAI/types/completion.py | 19 ++-- endpoints/OAI/types/embedding.py | 16 ++- endpoints/OAI/types/tools.py | 4 +- endpoints/OAI/utils/chat_completion.py | 25 ++-- endpoints/OAI/utils/completion.py | 11 +- endpoints/OAI/utils/tools.py | 13 +-- endpoints/core/router.py | 7 +- endpoints/core/types/download.py | 15 ++- endpoints/core/types/health.py | 2 +- endpoints/core/types/lora.py | 13 +-- endpoints/core/types/model.py | 76 +++++++------ endpoints/core/types/sampler_overrides.py | 7 +- endpoints/core/types/template.py | 3 +- endpoints/core/types/token.py | 7 +- endpoints/core/utils/model.py | 5 +- endpoints/server.py | 3 +- main.py | 15 ++- start.py | 7 +- 49 files changed, 446 insertions(+), 482 deletions(-) diff --git a/backends/base_model_container.py b/backends/base_model_container.py index 96393aba..cc0c5c1e 100644 --- a/backends/base_model_container.py +++ b/backends/base_model_container.py @@ -2,13 +2,7 @@ import asyncio import pathlib from loguru import logger -from typing import ( - Any, - AsyncIterator, - Dict, - List, - Optional, -) +from typing import Any, AsyncIterator from common.multimodal import MultimodalEmbeddingWrapper from common.sampling import BaseSamplerRequest from common.templating import PromptTemplate @@ -21,7 +15,7 @@ class BaseModelContainer(abc.ABC): # Exposed model information model_dir: pathlib.Path = pathlib.Path("models") - prompt_template: Optional[PromptTemplate] = None + prompt_template: PromptTemplate | None = None # HF Model instance hf_model: HFModel @@ -34,7 +28,7 @@ class BaseModelContainer(abc.ABC): # The bool is a master switch for accepting requests # The lock keeps load tasks sequential # The condition notifies any waiting tasks - active_job_ids: Dict[str, Any] = {} + active_job_ids: dict[str, Any] = {} loaded: bool = False load_lock: asyncio.Lock load_condition: asyncio.Condition @@ -98,7 +92,7 @@ async def unload(self, loras_only: bool = False, **kwargs): pass @abc.abstractmethod - def encode_tokens(self, text: str, **kwargs) -> List[int]: + def encode_tokens(self, text: str, **kwargs) -> list[int]: """ Encodes a string of text into a list of token IDs. 
@@ -113,7 +107,7 @@ def encode_tokens(self, text: str, **kwargs) -> List[int]: pass @abc.abstractmethod - def decode_tokens(self, ids: List[int], **kwargs) -> str: + def decode_tokens(self, ids: list[int], **kwargs) -> str: """ Decodes a list of token IDs back into a string. @@ -128,7 +122,7 @@ def decode_tokens(self, ids: List[int], **kwargs) -> str: pass @abc.abstractmethod - def get_special_tokens(self) -> Dict[str, Any]: + def get_special_tokens(self) -> dict[str, Any]: """ Gets special tokens used by the model/tokenizer. @@ -164,7 +158,7 @@ async def wait_for_jobs(self, skip_wait: bool = False): # Optional methods async def load_loras( self, lora_directory: pathlib.Path, **kwargs - ) -> Dict[str, List[str]]: + ) -> dict[str, list[str]]: """ Loads LoRA adapters. Base implementation does nothing or raises error. @@ -184,7 +178,7 @@ async def load_loras( ], } - def get_loras(self) -> List[Any]: + def get_loras(self) -> list[Any]: """ Gets the currently loaded LoRA adapters. Base implementation returns empty list. @@ -200,9 +194,9 @@ async def generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, - ) -> Dict[str, Any]: + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, + ) -> dict[str, Any]: """ Generates a complete response for a given prompt and parameters. @@ -225,9 +219,9 @@ async def stream_generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, - ) -> AsyncIterator[Dict[str, Any]]: + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, + ) -> AsyncIterator[dict[str, Any]]: """ Generates a response iteratively (streaming) for a given prompt. 
diff --git a/backends/exllamav2/grammar.py b/backends/exllamav2/grammar.py index f9bb464c..2605bec3 100644 --- a/backends/exllamav2/grammar.py +++ b/backends/exllamav2/grammar.py @@ -1,7 +1,6 @@ import traceback -import typing from functools import lru_cache -from typing import List +from typing import Any import torch from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer @@ -16,7 +15,7 @@ class ExLlamaV2Grammar: """ExLlamaV2 class for various grammar filters/parsers.""" - filters: List[ExLlamaV2Filter] + filters: list[ExLlamaV2Filter] def __init__(self): self.filters = [] @@ -123,7 +122,7 @@ def __init__(self, nonterminal: str, kbnf_string: str): self.kbnf_string = kbnf_string # Return the entire input string as the extracted string - def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]: + def extract(self, input_str: str) -> tuple[str, Any] | None: return "", input_str @property diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index ae71b00f..903b4929 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -25,7 +25,6 @@ ) from itertools import zip_longest from loguru import logger -from typing import Dict, List, Optional from backends.base_model_container import BaseModelContainer from backends.exllamav2.grammar import ( @@ -58,45 +57,45 @@ class ExllamaV2Container(BaseModelContainer): # Model directories model_dir: pathlib.Path = pathlib.Path("models") draft_model_dir: pathlib.Path = pathlib.Path("models") - prompt_template: Optional[PromptTemplate] = None + prompt_template: PromptTemplate | None = None # HF model instance hf_model: HFModel # Exl2 vars - config: Optional[ExLlamaV2Config] = None - model: Optional[ExLlamaV2] = None - cache: Optional[ExLlamaV2Cache] = None - tokenizer: Optional[ExLlamaV2Tokenizer] = None - generator: Optional[ExLlamaV2DynamicGeneratorAsync] = None - prompt_template: Optional[PromptTemplate] = None + config: ExLlamaV2Config | None = None + model: ExLlamaV2 | None = None + cache: ExLlamaV2Cache | None = None + tokenizer: ExLlamaV2Tokenizer | None = None + generator: ExLlamaV2DynamicGeneratorAsync | None = None + prompt_template: PromptTemplate | None = None paged: bool = True # Draft model vars use_draft_model: bool = False - draft_config: Optional[ExLlamaV2Config] = None - draft_model: Optional[ExLlamaV2] = None - draft_cache: Optional[ExLlamaV2Cache] = None + draft_config: ExLlamaV2Config | None = None + draft_model: ExLlamaV2 | None = None + draft_cache: ExLlamaV2Cache | None = None # Internal config vars cache_size: int = None cache_mode: str = "FP16" draft_cache_mode: str = "FP16" - max_batch_size: Optional[int] = None + max_batch_size: int | None = None # GPU split vars - gpu_split: List[float] = [] - draft_gpu_split: List[float] = [] + gpu_split: list[float] = [] + draft_gpu_split: list[float] = [] gpu_split_auto: bool = True - autosplit_reserve: List[float] = [96 * 1024**2] + autosplit_reserve: list[float] = [96 * 1024**2] use_tp: bool = False # Vision vars use_vision: bool = False - vision_model: Optional[ExLlamaV2VisionTower] = None + vision_model: ExLlamaV2VisionTower | None = None # Load synchronization - active_job_ids: Dict[str, Optional[ExLlamaV2DynamicJobAsync]] = {} + active_job_ids: dict[str, ExLlamaV2DynamicJobAsync | None] = {} loaded: bool = False load_lock: asyncio.Lock = asyncio.Lock() load_condition: asyncio.Condition = asyncio.Condition() @@ -130,7 +129,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Check if the model arch is 
compatible with various exl2 features self.config.arch_compat_overrides() - # Set vision state and error if vision isn't supported on the current model + # set vision state and error if vision isn't supported on the current model self.use_vision = unwrap(kwargs.get("vision"), False) if self.use_vision and not self.config.vision_model_type: raise ValueError( @@ -185,12 +184,12 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs gpu_split = unwrap(kwargs.get("gpu_split"), []) gpu_device_list = list(range(0, gpu_count)) - # Set GPU split options + # set GPU split options if gpu_count == 1: self.gpu_split_auto = False logger.info("Disabling GPU split because one GPU is in use.") else: - # Set tensor parallel + # set tensor parallel if use_tp: self.use_tp = True @@ -233,7 +232,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Hardcode max output length to 16 self.config.max_output_len = 16 - # Set max batch size to the config override + # set max batch size to the config override self.max_batch_size = unwrap(kwargs.get("max_batch_size")) # Check whether the user's configuration supports flash/paged attention @@ -262,7 +261,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Grab user-set max seq len user_max_seq_len = kwargs.get("max_seq_len") - # Set k/v cache size + # set k/v cache size # cache_size is only relevant when paged mode is enabled if self.paged: user_cache_size = coalesce(kwargs.get("cache_size"), user_max_seq_len, 4096) @@ -273,7 +272,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs user_max_seq_len, min(hf_model.hf_config.max_position_embeddings, 4096) ) - # Set the rope scale + # set the rope scale self.config.scale_pos_emb = unwrap( kwargs.get("rope_scale"), self.config.scale_pos_emb ) @@ -322,7 +321,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs self.config.max_input_len = chunk_size self.config.max_attention_size = chunk_size**2 - # Set user-configured draft model values + # set user-configured draft model values if self.use_draft_model: self.draft_config.max_seq_len = self.config.max_seq_len @@ -330,7 +329,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs draft_args.get("draft_rope_scale"), 1.0 ) - # Set draft rope alpha. Follows same behavior as model rope alpha. + # set draft rope alpha. Follows same behavior as model rope alpha. 
# Use the max_position_embeddings of the model draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto") if draft_rope_alpha == "auto": @@ -341,7 +340,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs else: self.draft_config.scale_alpha_value = draft_rope_alpha - # Set draft cache mode + # set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") # Catch exllamav3 draft_cache_mode @@ -750,9 +749,9 @@ async def load_loras(self, lora_directory: pathlib.Path, **kwargs): # Wait for existing generation jobs to finish await self.wait_for_jobs(kwargs.get("skip_wait")) - loras_to_load: List[ExLlamaV2Lora] = [] - success: List[str] = [] - failure: List[str] = [] + loras_to_load: list[ExLlamaV2Lora] = [] + success: list[str] = [] + failure: list[str] = [] for lora in loras: lora_name = lora.get("name") @@ -836,7 +835,7 @@ async def unload(self, loras_only: bool = False, **kwargs): await self.generator.close() self.generator = None - # Set all model state variables to False + # set all model state variables to False self.loaded = False gc.collect() @@ -869,7 +868,7 @@ def encode_tokens(self, text: str, **kwargs): .tolist() ) - def decode_tokens(self, ids: List[int], **kwargs): + def decode_tokens(self, ids: list[int], **kwargs): """Wrapper to decode tokens from a list of IDs""" ids = torch.tensor([ids]) @@ -908,8 +907,8 @@ async def generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): """Generate a response to a prompt.""" generations = [] @@ -969,8 +968,8 @@ async def stream_generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): try: # Wait for load lock to be freed before processing @@ -1136,15 +1135,15 @@ def assign_gen_params( "top_k, top_p, and typical to 1.0, 1, 0, and 0." ) - # Set banned tokens + # set banned tokens if params.banned_tokens: gen_settings.disallow_tokens(self.tokenizer, params.banned_tokens) - # Set allowed tokens + # set allowed tokens if params.allowed_tokens: gen_settings.allow_tokens(self.tokenizer, params.allowed_tokens) - # Set logit bias + # set logit bias if params.logit_bias: # Create a vocab tensor if it doesn't exist for token biasing if gen_settings.token_bias is None: @@ -1242,8 +1241,8 @@ async def generate_gen( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): """ Create generator function for prompt completion. 
@@ -1261,7 +1260,7 @@ async def generate_gen( grammar_handler, ) - # Set banned strings + # set banned strings banned_strings = params.banned_strings if banned_strings and len(grammar_handler.filters) > 0: logger.warning( @@ -1271,7 +1270,7 @@ async def generate_gen( banned_strings = [] - # Set CFG scale and negative prompt + # set CFG scale and negative prompt cfg_scale = params.cfg_scale negative_prompt = None if cfg_scale not in [None, 1.0]: @@ -1301,7 +1300,7 @@ async def generate_gen( stop_conditions = params.stop ban_eos_token = params.ban_eos_token - # Set add_bos_token for generation + # set add_bos_token for generation add_bos_token = unwrap(params.add_bos_token, self.hf_model.add_bos_token()) # Fetch EOS tokens from the HF model if they exist @@ -1309,7 +1308,7 @@ async def generate_gen( # Ban the EOS token if specified. If not, append to stop conditions # as well. - # Set this below logging to avoid polluting the stop strings array + # set this below logging to avoid polluting the stop strings array if ban_eos_token: gen_settings.disallow_tokens(self.tokenizer, eos_tokens) else: diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index d385de7c..4b369c7f 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -7,9 +7,6 @@ from typing import ( Any, AsyncIterator, - Dict, - List, - Optional, ) from exllamav3 import ( @@ -49,7 +46,7 @@ class ExllamaV3Container(BaseModelContainer): # Exposed model information model_dir: pathlib.Path = pathlib.Path("models") - prompt_template: Optional[PromptTemplate] = None + prompt_template: PromptTemplate | None = None # HF Model instance hf_model: HFModel @@ -58,26 +55,26 @@ class ExllamaV3Container(BaseModelContainer): # The bool is a master switch for accepting requests # The lock keeps load tasks sequential # The condition notifies any waiting tasks - active_job_ids: Dict[str, Any] = {} + active_job_ids: dict[str, Any] = {} loaded: bool = False load_lock: asyncio.Lock = asyncio.Lock() load_condition: asyncio.Condition = asyncio.Condition() # Exl3 vars - model: Optional[Model] = None - cache: Optional[Cache] = None - draft_model: Optional[Model] = None - draft_cache: Optional[Cache] = None - tokenizer: Optional[Tokenizer] = None - config: Optional[Config] = None - draft_config: Optional[Config] = None - generator: Optional[AsyncGenerator] = None - vision_model: Optional[Model] = None + model: Model | None = None + cache: Cache | None = None + draft_model: Model | None = None + draft_cache: Cache | None = None + tokenizer: Tokenizer | None = None + config: Config | None = None + draft_config: Config | None = None + generator: AsyncGenerator | None = None + vision_model: Model | None = None # Class-specific vars - gpu_split: Optional[List[float]] = None + gpu_split: list[float] | None = None gpu_split_auto: bool = True - autosplit_reserve: Optional[List[float]] = [96 / 1024] + autosplit_reserve: list[float] | None = [96 / 1024] use_tp: bool = False tp_backend: str = "native" max_seq_len: int = 4096 @@ -85,8 +82,8 @@ class ExllamaV3Container(BaseModelContainer): cache_mode: str = "FP16" draft_cache_mode: str = "FP16" chunk_size: int = 2048 - max_rq_tokens: Optional[int] = 2048 - max_batch_size: Optional[int] = None + max_rq_tokens: int | None = 2048 + max_batch_size: int | None = None # Required methods @classmethod @@ -160,12 +157,12 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs gpu_device_list = list(range(0, gpu_count)) use_tp = unwrap(kwargs.get("tensor_parallel"), 
False) - # Set GPU split options + # set GPU split options if gpu_count == 1: self.gpu_split_auto = False logger.info("Disabling GPU split because one GPU is in use.") else: - # Set tensor parallel + # set tensor parallel if use_tp: self.use_tp = True tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native") @@ -182,7 +179,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # TP has its own autosplit loader self.gpu_split_auto = False - # Set GPU split options + # set GPU split options # Enable manual GPU split if provided if gpu_split: self.gpu_split_auto = False @@ -238,7 +235,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Draft cache if self.use_draft_model: - # Set draft cache mode + # set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") self.draft_cache = self.create_cache( self.draft_cache_mode, self.draft_model @@ -579,7 +576,7 @@ async def unload(self, loras_only: bool = False, **kwargs): async with self.load_condition: self.load_condition.notify_all() - def encode_tokens(self, text: str, **kwargs) -> List[int]: + def encode_tokens(self, text: str, **kwargs) -> list[int]: """ Encodes a string of text into a list of token IDs. @@ -607,7 +604,7 @@ def encode_tokens(self, text: str, **kwargs) -> List[int]: .tolist() ) - def decode_tokens(self, ids: List[int], **kwargs) -> str: + def decode_tokens(self, ids: list[int], **kwargs) -> str: """ Decodes a list of token IDs back into a string. @@ -666,9 +663,9 @@ async def generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, - ) -> Dict[str, Any]: + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, + ) -> dict[str, Any]: """ Generates a complete response for a given prompt and parameters. @@ -738,9 +735,9 @@ async def stream_generate( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, - ) -> AsyncIterator[Dict[str, Any]]: + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, + ) -> AsyncIterator[dict[str, Any]]: """ Generates a response iteratively (streaming) for a given prompt. @@ -859,8 +856,8 @@ async def generate_gen( request_id: str, prompt: str, params: BaseSamplerRequest, - abort_event: Optional[asyncio.Event] = None, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + abort_event: asyncio.Event | None = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): """ Create generator function for prompt completion. 
@@ -873,7 +870,7 @@ async def generate_gen( # Penalties - # Set penalty range + # set penalty range penalty_range = unwrap(params.penalty_range, self.max_seq_len) # Exl3's version of including the entire context @@ -914,7 +911,7 @@ async def generate_gen( sampler_builder.temperature(params.temperature) # Build the sampler - # Set greedy if temperature is 0 + # set greedy if temperature is 0 sampler = sampler_builder.build(params.temperature == 0) # Dynamically scale penalty range to output tokens diff --git a/backends/exllamav3/sampler.py b/backends/exllamav3/sampler.py index 7b08a9b1..eef7d944 100644 --- a/backends/exllamav3/sampler.py +++ b/backends/exllamav3/sampler.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import List from exllamav3.generator.sampler import ( CustomSampler, SS_Temperature, @@ -20,7 +19,7 @@ class ExllamaV3SamplerBuilder: Custom sampler chain/stack for TabbyAPI """ - stack: List[SS_Base] = field(default_factory=list) + stack: list[SS_Base] = field(default_factory=list) def penalties(self, rep_p, freq_p, pres_p, penalty_range, rep_decay): self.stack += [ diff --git a/backends/infinity/model.py b/backends/infinity/model.py index c131e3cb..e1dd9b1b 100644 --- a/backends/infinity/model.py +++ b/backends/infinity/model.py @@ -1,8 +1,9 @@ +from __future__ import annotations + import gc import pathlib import torch from loguru import logger -from typing import List, Optional from common.utils import unwrap from common.optional_dependencies import dependencies @@ -17,7 +18,7 @@ class InfinityContainer: loaded: bool = False # Use a runtime type hint here - engine: Optional["AsyncEmbeddingEngine"] = None + engine: AsyncEmbeddingEngine | None = None def __init__(self, model_directory: pathlib.Path): self.model_dir = model_directory @@ -49,7 +50,7 @@ async def unload(self): logger.info("Embedding model unloaded.") - async def generate(self, sentence_input: List[str]): + async def generate(self, sentence_input: list[str]): result_embeddings, usage = await self.engine.embed(sentence_input) return {"embeddings": result_embeddings, "usage": usage} diff --git a/colab/TabbyAPI_Colab_Example.ipynb b/colab/TabbyAPI_Colab_Example.ipynb index b8e32f0a..c23bd345 100644 --- a/colab/TabbyAPI_Colab_Example.ipynb +++ b/colab/TabbyAPI_Colab_Example.ipynb @@ -184,13 +184,13 @@ " # Only use if your model was trained on long context with rope (check config.json)\n", " rope_alpha: {RopeAlpha}\n", "\n", - " # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n", + " # Disable Flash-attention 2. set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n", " no_flash_attention: {NoFlashAttention}\n", "\n", " # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)\n", " cache_mode: {CacheMode}\n", "\n", - " # Set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n", + " # set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n", " # NOTE: Only works with chat completion message lists!\n", " prompt_template: {PromptTemplate}\n", "\n", @@ -218,7 +218,7 @@ " # Overrides the directory to look for loras (default: loras)\n", " lora_dir: loras\n", "\n", - " # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.\n", + " # list of loras to load and associated scaling factors (default: 1.0). 
Comment out unused entries or add more rows as needed.\n", " loras:\n", " - name: {lora}\n", " scaling: {LoraScaling}\n", diff --git a/common/args.py b/common/args.py index a0da4f98..cd286877 100644 --- a/common/args.py +++ b/common/args.py @@ -1,7 +1,6 @@ """Argparser for overriding config values""" import argparse -from typing import Optional from pydantic import BaseModel from common.config_models import TabbyConfigModel @@ -25,7 +24,7 @@ def add_field_to_group(group, field_name, field_type, field) -> None: def init_argparser( - existing_parser: Optional[argparse.ArgumentParser] = None, + existing_parser: argparse.ArgumentParser | None = None, ) -> argparse.ArgumentParser: """ Initializes an argparse parser based on a Pydantic config schema. diff --git a/common/auth.py b/common/auth.py index b02cdd02..c986fc40 100644 --- a/common/auth.py +++ b/common/auth.py @@ -10,7 +10,6 @@ from fastapi import Header, HTTPException, Request from pydantic import BaseModel from loguru import logger -from typing import Optional from common.utils import coalesce @@ -38,7 +37,7 @@ def verify_key(self, test_key: str, key_type: str): # Global auth constants -AUTH_KEYS: Optional[AuthKeys] = None +AUTH_KEYS: AuthKeys | None = None DISABLE_AUTH: bool = False @@ -52,7 +51,7 @@ async def load_auth_keys(disable_from_config: bool): if disable_from_config: logger.warning( "Disabling authentication makes your instance vulnerable. " - "Set the `disable_auth` flag to False in config.yml if you " + "set the `disable_auth` flag to False in config.yml if you " "want to share this instance with others." ) diff --git a/common/config_models.py b/common/config_models.py index 0e71734c..781acdb3 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -6,17 +6,17 @@ PrivateAttr, field_validator, ) -from typing import List, Literal, Optional, Union +from typing import Literal CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"] -CACHE_TYPE = Union[CACHE_SIZES, constr(pattern=r"^[2-8]\s*,\s*[2-8]$")] +CACHE_TYPE = CACHE_SIZES | constr(pattern=r"^[2-8]\s*,\s*[2-8]$") class Metadata(BaseModel): """metadata model for config options""" - include_in_config: Optional[bool] = Field(True) + include_in_config: bool | None = Field(True) class BaseConfigModel(BaseModel): @@ -29,7 +29,7 @@ class ConfigOverrideConfig(BaseConfigModel): """Model for overriding a provided config file.""" # TODO: convert this to a pathlib.path? - config: Optional[str] = Field( + config: str | None = Field( None, description=("Path to an overriding config.yml file") ) @@ -39,17 +39,17 @@ class ConfigOverrideConfig(BaseConfigModel): class NetworkConfig(BaseConfigModel): """Options for networking""" - host: Optional[str] = Field( + host: str | None = Field( "127.0.0.1", description=( "The IP to host on (default: 127.0.0.1).\n" "Use 0.0.0.0 to expose on all network adapters." ), ) - port: Optional[int] = Field( + port: int | None = Field( 5000, description=("The port to host on (default: 5000).") ) - disable_auth: Optional[bool] = Field( + disable_auth: bool | None = Field( False, description=( "Disable HTTP token authentication with requests.\n" @@ -57,21 +57,21 @@ class NetworkConfig(BaseConfigModel): "Turn on this option if you are ONLY connecting from localhost." ), ) - disable_fetch_requests: Optional[bool] = Field( + disable_fetch_requests: bool | None = Field( False, description=( "Disable fetching external content in response to requests," "such as images from URLs." 
), ) - send_tracebacks: Optional[bool] = Field( + send_tracebacks: bool | None = Field( False, description=( "Send tracebacks over the API (default: False).\n" "NOTE: Only enable this for debug purposes." ), ) - api_servers: Optional[List[Literal["oai", "kobold"]]] = Field( + api_servers: list[Literal["oai", "kobold"]] | None = Field( ["OAI"], description=( 'Select API servers to enable (default: ["OAI"]).\n' @@ -91,15 +91,15 @@ def api_server_validator(cls, api_servers): class LoggingConfig(BaseConfigModel): """Options for logging""" - log_prompt: Optional[bool] = Field( + log_prompt: bool | None = Field( False, description=("Enable prompt logging (default: False)."), ) - log_generation_params: Optional[bool] = Field( + log_generation_params: bool | None = Field( False, description=("Enable generation parameter logging (default: False)."), ) - log_requests: Optional[bool] = Field( + log_requests: bool | None = Field( False, description=( "Enable request logging (default: False).\n" @@ -123,7 +123,7 @@ class ModelConfig(BaseConfigModel): "Windows users, do NOT put this path in quotes!" ), ) - inline_model_loading: Optional[bool] = Field( + inline_model_loading: bool | None = Field( False, description=( "Allow direct loading of models " @@ -132,7 +132,7 @@ class ModelConfig(BaseConfigModel): "Enable dummy models to add exceptions for invalid model names." ), ) - use_dummy_models: Optional[bool] = Field( + use_dummy_models: bool | None = Field( False, description=( "Sends dummy model names when the models endpoint is queried. " @@ -140,7 +140,7 @@ class ModelConfig(BaseConfigModel): "Enable this if the client is looking for specific OAI models.\n" ), ) - dummy_model_names: List[str] = Field( + dummy_model_names: list[str] = Field( default=["gpt-3.5-turbo"], description=( "A list of fake model names that are sent via the /v1/models endpoint. " @@ -148,7 +148,7 @@ class ModelConfig(BaseConfigModel): "Also used as bypasses for strict mode if inline_model_loading is true." ), ) - model_name: Optional[str] = Field( + model_name: str | None = Field( None, description=( "An initial model to load.\n" @@ -156,7 +156,7 @@ class ModelConfig(BaseConfigModel): "REQUIRED: This must be filled out to load a model on startup." ), ) - use_as_default: List[str] = Field( + use_as_default: list[str] = Field( default_factory=list, description=( "Names of args to use as a fallback for API load requests (default: []).\n" @@ -165,22 +165,22 @@ class ModelConfig(BaseConfigModel): "Example: ['max_seq_len', 'cache_mode']." ), ) - backend: Optional[str] = Field( + backend: str | None = Field( None, description=( "Backend to use for this model (auto-detect if not specified)\n" "Options: exllamav2, exllamav3" ), ) - max_seq_len: Optional[int] = Field( + max_seq_len: int | None = Field( None, description=( "Max sequence length (default: 4096).\n" - "Set to -1 to fetch from the model's config.json" + "set to -1 to fetch from the model's config.json" ), ge=-1, ) - cache_size: Optional[int] = Field( + cache_size: int | None = Field( None, description=( "Size of the prompt cache to allocate (default: max_seq_len).\n" @@ -190,7 +190,7 @@ class ModelConfig(BaseConfigModel): multiple_of=256, gt=0, ) - cache_mode: Optional[CACHE_TYPE] = Field( + cache_mode: CACHE_TYPE | None = Field( "FP16", description=( "Enable different cache modes for VRAM savings (default: FP16).\n" @@ -199,7 +199,7 @@ class ModelConfig(BaseConfigModel): "are integers from 2-8 (i.e. 8,8)." 
         ),
     )
-    tensor_parallel: Optional[bool] = Field(
+    tensor_parallel: bool | None = Field(
         False,
         description=(
             "Load model with tensor parallelism (default: False).\n"
@@ -207,7 +207,7 @@ class ModelConfig(BaseConfigModel):
             "This ignores the gpu_split_auto value."
         ),
     )
-    tensor_parallel_backend: Optional[str] = Field(
+    tensor_parallel_backend: str | None = Field(
         "native",
         description=(
             "Sets a backend type for tensor parallelism. (default: native).\n"
@@ -216,28 +216,28 @@ class ModelConfig(BaseConfigModel):
             "NCCL is recommended for NVLink."
         ),
     )
-    gpu_split_auto: Optional[bool] = Field(
+    gpu_split_auto: bool | None = Field(
         True,
         description=(
             "Automatically allocate resources to GPUs (default: True).\n"
             "Not parsed for single GPU users."
         ),
     )
-    autosplit_reserve: List[float] = Field(
+    autosplit_reserve: list[float] = Field(
         [96],
         description=(
             "Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"
             "Represented as an array of MB per GPU."
         ),
     )
-    gpu_split: List[float] = Field(
+    gpu_split: list[float] = Field(
         default_factory=list,
         description=(
             "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
             "Used with tensor parallelism."
         ),
     )
-    rope_scale: Optional[float] = Field(
+    rope_scale: float | None = Field(
         1.0,
         description=(
             "Rope scale (default: 1.0).\n"
@@ -246,16 +246,16 @@ class ModelConfig(BaseConfigModel):
             "Leave blank to pull the value from the model."
         ),
     )
-    rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
+    rope_alpha: float | Literal["auto"] | None = Field(
         None,
         description=(
             "Rope alpha (default: None).\n"
             'Same as alpha_value. Set to "auto" to auto-calculate.\n'
             "Leaving this value blank will either pull from the model "
             "or auto-calculate."
         ),
     )
-    chunk_size: Optional[int] = Field(
+    chunk_size: int | None = Field(
         2048,
         description=(
             "Chunk size for prompt ingestion (default: 2048).\n"
@@ -265,7 +265,7 @@ class ModelConfig(BaseConfigModel):
         ),
         gt=0,
     )
-    output_chunking: Optional[bool] = Field(
+    output_chunking: bool | None = Field(
         True,
         description=(
             "Use output chunking (default: True)\n"
@@ -274,27 +274,27 @@ class ModelConfig(BaseConfigModel):
             "Used by EXL3 models only.\n"
         ),
     )
-    max_batch_size: Optional[int] = Field(
+    max_batch_size: int | None = Field(
         None,
         description=(
             "Set the maximum number of prompts to process at one time "
             "(default: None/Automatic).\n"
             "Automatically calculated if left blank.\n"
             "NOTE: Only available for Nvidia ampere (30 series) and above GPUs."
         ),
         ge=1,
     )
-    prompt_template: Optional[str] = Field(
+    prompt_template: str | None = Field(
         None,
         description=(
             "Set the prompt template for this model. (default: None)\n"
             "If empty, attempts to look for the model's chat template.\n"
             "If a model contains multiple templates in its tokenizer_config.json,\n"
             "set prompt_template to the name of the template you want to use.\n"
             "NOTE: Only works with chat completion message lists!"
         ),
     )
-    vision: Optional[bool] = Field(
+    vision: bool | None = Field(
         False,
         description=(
             "Enables vision support if the model supports it. (default: False)"
@@ -312,18 +312,18 @@ class DraftModelConfig(BaseConfigModel):
     """
     # TODO: convert this to a pathlib.path?
-    draft_model_dir: Optional[str] = Field(
+    draft_model_dir: str | None = Field(
         "models",
         description=("Directory to look for draft models (default: models)"),
     )
-    draft_model_name: Optional[str] = Field(
+    draft_model_name: str | None = Field(
         None,
         description=(
             "An initial draft model to load.\n"
             "Ensure the model is in the model directory."
         ),
     )
-    draft_rope_scale: Optional[float] = Field(
+    draft_rope_scale: float | None = Field(
         1.0,
         description=(
             "Rope scale for draft models (default: 1.0).\n"
@@ -331,23 +331,23 @@ class DraftModelConfig(BaseConfigModel):
             "Use if the draft model was trained on long context with rope."
         ),
     )
-    draft_rope_alpha: Optional[float] = Field(
+    draft_rope_alpha: float | None = Field(
         None,
         description=(
             "Rope alpha for draft models (default: None).\n"
             'Same as alpha_value. Set to "auto" to auto-calculate.\n'
             "Leaving this value blank will either pull from the model "
             "or auto-calculate."
         ),
     )
-    draft_cache_mode: Optional[CACHE_SIZES] = Field(
+    draft_cache_mode: CACHE_SIZES | None = Field(
         "FP16",
         description=(
             "Cache mode for draft models to save VRAM (default: FP16).\n"
             f"Possible values: {str(CACHE_SIZES)[15:-1]}."
         ),
     )
-    draft_gpu_split: List[float] = Field(
+    draft_gpu_split: list[float] = Field(
         default_factory=list,
         description=(
             "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
@@ -359,7 +359,7 @@ class SamplingConfig(BaseConfigModel):
 class SamplingConfig(BaseConfigModel):
     """Options for Sampling"""
-    override_preset: Optional[str] = Field(
+    override_preset: str | None = Field(
         None,
         description=(
             "Select a sampler override preset (default: None).\n"
@@ -376,7 +376,7 @@ class LoraInstanceModel(BaseConfigModel):
 class LoraInstanceModel(BaseConfigModel):
     """Model representing an instance of a Lora."""
-    name: Optional[str] = None
+    name: str | None = None
     scaling: float = Field(1.0, ge=0)
@@ -384,13 +384,13 @@ class LoraConfig(BaseConfigModel):
     """Options for Loras"""
     # TODO: convert this to a pathlib.path?
-    lora_dir: Optional[str] = Field(
+    lora_dir: str | None = Field(
         "loras", description=("Directory to look for LoRAs (default: loras).")
     )
-    loras: Optional[List[LoraInstanceModel]] = Field(
+    loras: list[LoraInstanceModel] | None = Field(
         None,
         description=(
             "List of LoRAs to load and associated scaling factors "
             "(default scale: 1.0).\n"
             "For the YAML file, add each entry as a YAML list:\n"
             "- name: lora1\n"
             "scaling: 1.0"
         ),
     )
@@ -407,11 +407,11 @@ class EmbeddingsConfig(BaseConfigModel):
     """
     # TODO: convert this to a pathlib.path?
-    embedding_model_dir: Optional[str] = Field(
+    embedding_model_dir: str | None = Field(
         "models",
         description=("Directory to look for embedding models (default: models)."),
     )
-    embeddings_device: Optional[Literal["cpu", "auto", "cuda"]] = Field(
+    embeddings_device: Literal["cpu", "auto", "cuda"] | None = Field(
         "cpu",
         description=(
             "Device to load embedding models on (default: cpu).\n"
             "Possible values: cpu, auto, cuda.\n"
             "If using an AMD GPU, set this value to 'cuda'."
), ) - embedding_model_name: Optional[str] = Field( + embedding_model_name: str | None = Field( None, description=("An initial embedding model to load on the infinity backend."), ) @@ -429,7 +429,7 @@ class EmbeddingsConfig(BaseConfigModel): class DeveloperConfig(BaseConfigModel): """Options for development and experimentation""" - unsafe_launch: Optional[bool] = Field( + unsafe_launch: bool | None = Field( False, description=( "Skip Exllamav2 version check (default: False).\n" @@ -437,13 +437,13 @@ class DeveloperConfig(BaseConfigModel): "than enabling this flag." ), ) - disable_request_streaming: Optional[bool] = Field( + disable_request_streaming: bool | None = Field( False, description=("Disable API request streaming (default: False).") ) - realtime_process_priority: Optional[bool] = Field( + realtime_process_priority: bool | None = Field( False, description=( - "Set process to use a higher priority.\n" + "set process to use a higher priority.\n" "For realtime process priority, run as administrator or sudo.\n" "Otherwise, the priority will be set to high." ), @@ -453,27 +453,27 @@ class DeveloperConfig(BaseConfigModel): class TabbyConfigModel(BaseModel): """Base model for a TabbyConfig.""" - config: Optional[ConfigOverrideConfig] = Field( + config: ConfigOverrideConfig | None = Field( default_factory=ConfigOverrideConfig.model_construct ) - network: Optional[NetworkConfig] = Field( + network: NetworkConfig | None = Field( default_factory=NetworkConfig.model_construct ) - logging: Optional[LoggingConfig] = Field( + logging: LoggingConfig | None = Field( default_factory=LoggingConfig.model_construct ) - model: Optional[ModelConfig] = Field(default_factory=ModelConfig.model_construct) - draft_model: Optional[DraftModelConfig] = Field( + model: ModelConfig | None = Field(default_factory=ModelConfig.model_construct) + draft_model: DraftModelConfig | None = Field( default_factory=DraftModelConfig.model_construct ) - lora: Optional[LoraConfig] = Field(default_factory=LoraConfig.model_construct) - embeddings: Optional[EmbeddingsConfig] = Field( + lora: LoraConfig | None = Field(default_factory=LoraConfig.model_construct) + embeddings: EmbeddingsConfig | None = Field( default_factory=EmbeddingsConfig.model_construct ) - sampling: Optional[SamplingConfig] = Field( + sampling: SamplingConfig | None = Field( default_factory=SamplingConfig.model_construct ) - developer: Optional[DeveloperConfig] = Field( + developer: DeveloperConfig | None = Field( default_factory=DeveloperConfig.model_construct ) diff --git a/common/downloader.py b/common/downloader.py index 8307bbcd..8f3155f2 100644 --- a/common/downloader.py +++ b/common/downloader.py @@ -10,7 +10,6 @@ from fnmatch import fnmatch from loguru import logger from rich.progress import Progress -from typing import List, Optional from common.logger import get_progress_bar from common.tabby_config import config @@ -27,7 +26,7 @@ class RepoItem: async def _download_file( session: aiohttp.ClientSession, repo_item: RepoItem, - token: Optional[str], + token: str | None, download_path: pathlib.Path, chunk_limit: int, progress: Progress, @@ -92,7 +91,7 @@ def _get_repo_info(repo_id, revision, token): ] -def _get_download_folder(repo_id: str, repo_type: str, folder_name: Optional[str]): +def _get_download_folder(repo_id: str, repo_type: str, folder_name: str | None): """Gets the download folder for the repo.""" if repo_type == "lora": @@ -105,7 +104,7 @@ def _get_download_folder(repo_id: str, repo_type: str, folder_name: Optional[str def _check_exclusions( - 
filename: str, include_patterns: List[str], exclude_patterns: List[str] + filename: str, include_patterns: list[str], exclude_patterns: list[str] ): include_result = any(fnmatch(filename, pattern) for pattern in include_patterns) exclude_result = any(fnmatch(filename, pattern) for pattern in exclude_patterns) @@ -115,14 +114,14 @@ def _check_exclusions( async def hf_repo_download( repo_id: str, - folder_name: Optional[str], - revision: Optional[str], - token: Optional[str], - include: Optional[List[str]], - exclude: Optional[List[str]], - chunk_limit: Optional[float] = None, - timeout: Optional[int] = None, - repo_type: Optional[str] = "model", + folder_name: str | None, + revision: str | None, + token: str | None, + include: list[str] | None, + exclude: list[str] | None, + chunk_limit: float | None = None, + timeout: int | None = None, + repo_type: str | None = "model", ): """Gets a repo's information from HuggingFace and downloads it locally.""" diff --git a/common/gen_logging.py b/common/gen_logging.py index fcd3c01d..6cf5ddcb 100644 --- a/common/gen_logging.py +++ b/common/gen_logging.py @@ -3,7 +3,6 @@ """ from loguru import logger -from typing import Optional from common.tabby_config import config @@ -29,7 +28,7 @@ def log_generation_params(**kwargs): logger.info(f"Generation options: {kwargs}\n") -def log_prompt(prompt: str, request_id: str, negative_prompt: Optional[str] = None): +def log_prompt(prompt: str, request_id: str, negative_prompt: str | None = None): """Logs the prompt to console.""" if config.logging.log_prompt: formatted_prompt = "\n" + prompt @@ -55,7 +54,7 @@ def log_response(request_id: str, response: str): def log_metrics( request_id: str, metrics: dict, - context_len: Optional[int], + context_len: int | None, max_seq_len: int, ): initial_response = ( diff --git a/common/health.py b/common/health.py index 4d21d6af..a31cb689 100644 --- a/common/health.py +++ b/common/health.py @@ -3,7 +3,6 @@ from datetime import datetime, timezone from functools import partial from pydantic import BaseModel, Field -from typing import Union class UnhealthyEvent(BaseModel): @@ -24,7 +23,7 @@ def __init__(self): self.issues: deque[UnhealthyEvent] = deque(maxlen=100) self._lock = asyncio.Lock() - async def add_unhealthy_event(self, error: Union[str, Exception]): + async def add_unhealthy_event(self, error: str | Exception): """Add a new unhealthy event""" async with self._lock: if isinstance(error, Exception): diff --git a/common/model.py b/common/model.py index 4ac4861a..757688e8 100644 --- a/common/model.py +++ b/common/model.py @@ -10,7 +10,6 @@ from fastapi import HTTPException from loguru import logger from ruamel.yaml import YAML -from typing import Dict, Optional from backends.base_model_container import BaseModelContainer from common.logger import get_loading_progress_bar @@ -21,11 +20,11 @@ from common.utils import deep_merge_dict, unwrap # Global variables for model container -container: Optional[BaseModelContainer] = None +container: BaseModelContainer | None = None embeddings_container = None -_BACKEND_REGISTRY: Dict[str, BaseModelContainer] = {} +_BACKEND_REGISTRY: dict[str, BaseModelContainer] = {} if dependencies.exllamav2: from backends.exllamav2.model import ExllamaV2Container @@ -42,7 +41,7 @@ if dependencies.extras: from backends.infinity.model import InfinityContainer - embeddings_container: Optional[InfinityContainer] = None + embeddings_container: InfinityContainer | None = None class ModelType(Enum): diff --git a/common/multimodal.py b/common/multimodal.py index 
b92386f3..11e401dc 100644 --- a/common/multimodal.py +++ b/common/multimodal.py @@ -3,7 +3,6 @@ from common import model from loguru import logger from pydantic import BaseModel, Field -from typing import List from common.optional_dependencies import dependencies @@ -18,7 +17,7 @@ class MultimodalEmbeddingWrapper(BaseModel): type: str = None content: list = Field(default_factory=list) - text_alias: List[str] = Field(default_factory=list) + text_alias: list[str] = Field(default_factory=list) async def add(self, url: str): # Determine the type of vision embedding to use diff --git a/common/networking.py b/common/networking.py index 597ed078..f4d72917 100644 --- a/common/networking.py +++ b/common/networking.py @@ -7,7 +7,6 @@ from fastapi import Depends, HTTPException, Request from loguru import logger from pydantic import BaseModel -from typing import Optional from uuid import uuid4 from common.tabby_config import config @@ -17,7 +16,7 @@ class TabbyRequestErrorMessage(BaseModel): """Common request error type.""" message: str - trace: Optional[str] = None + trace: str | None = None class TabbyRequestError(BaseModel): diff --git a/common/sampling.py b/common/sampling.py index 49be5b99..832661ed 100644 --- a/common/sampling.py +++ b/common/sampling.py @@ -14,7 +14,6 @@ field_validator, model_validator, ) -from typing import Dict, List, Optional, Union from common.utils import filter_none_values, unwrap @@ -23,7 +22,7 @@ class BaseSamplerRequest(BaseModel): """Common class for sampler params that are used in APIs""" - max_tokens: Optional[int] = Field( + max_tokens: int | None = Field( default_factory=lambda: get_default_sampler_value("max_tokens"), validation_alias=AliasChoices( "max_tokens", "max_completion_tokens", "max_length" @@ -33,7 +32,7 @@ class BaseSamplerRequest(BaseModel): ge=0, ) - min_tokens: Optional[int] = Field( + min_tokens: int | None = Field( default_factory=lambda: get_default_sampler_value("min_tokens", 0), validation_alias=AliasChoices("min_tokens", "min_length"), description="Aliases: min_length", @@ -41,76 +40,76 @@ class BaseSamplerRequest(BaseModel): ge=0, ) - stop: Optional[Union[str, List[Union[str, int]]]] = Field( + stop: str | list[str | int] | None = Field( default_factory=lambda: get_default_sampler_value("stop", []), validation_alias=AliasChoices("stop", "stop_sequence"), description="Aliases: stop_sequence", ) - banned_strings: Optional[Union[str, List[str]]] = Field( + banned_strings: str | list[str] | None = Field( default_factory=lambda: get_default_sampler_value("banned_strings", []) ) - banned_tokens: Optional[Union[List[int], str]] = Field( + banned_tokens: list[int] | str | None = Field( default_factory=lambda: get_default_sampler_value("banned_tokens", []), validation_alias=AliasChoices("banned_tokens", "custom_token_bans"), description="Aliases: custom_token_bans", examples=[[128, 330]], ) - allowed_tokens: Optional[Union[List[int], str]] = Field( + allowed_tokens: list[int] | str | None = Field( default_factory=lambda: get_default_sampler_value("allowed_tokens", []), validation_alias=AliasChoices("allowed_tokens", "allowed_token_ids"), description="Aliases: allowed_token_ids", examples=[[128, 330]], ) - token_healing: Optional[bool] = Field( + token_healing: bool | None = Field( default_factory=lambda: get_default_sampler_value("token_healing", False) ) - temperature: Optional[float] = Field( + temperature: float | None = Field( default_factory=lambda: get_default_sampler_value("temperature", 1.0), examples=[1.0], ge=0, le=10, ) - temperature_last: 
Optional[bool] = Field( + temperature_last: bool | None = Field( default_factory=lambda: get_default_sampler_value("temperature_last", False), ) - smoothing_factor: Optional[float] = Field( + smoothing_factor: float | None = Field( default_factory=lambda: get_default_sampler_value("smoothing_factor", 0.0), ge=0, ) - top_k: Optional[int] = Field( + top_k: int | None = Field( default_factory=lambda: get_default_sampler_value("top_k", 0), ge=-1, ) - top_p: Optional[float] = Field( + top_p: float | None = Field( default_factory=lambda: get_default_sampler_value("top_p", 1.0), ge=0, le=1, examples=[1.0], ) - top_a: Optional[float] = Field( + top_a: float | None = Field( default_factory=lambda: get_default_sampler_value("top_a", 0.0) ) - min_p: Optional[float] = Field( + min_p: float | None = Field( default_factory=lambda: get_default_sampler_value("min_p", 0.0) ) - tfs: Optional[float] = Field( + tfs: float | None = Field( default_factory=lambda: get_default_sampler_value("tfs", 1.0), examples=[1.0], ) - typical: Optional[float] = Field( + typical: float | None = Field( default_factory=lambda: get_default_sampler_value("typical", 1.0), validation_alias=AliasChoices("typical", "typical_p"), description="Aliases: typical_p", @@ -119,30 +118,30 @@ class BaseSamplerRequest(BaseModel): le=1, ) - skew: Optional[float] = Field( + skew: float | None = Field( default_factory=lambda: get_default_sampler_value("skew", 0.0), examples=[0.0], ) - xtc_probability: Optional[float] = Field( + xtc_probability: float | None = Field( default_factory=lambda: get_default_sampler_value("xtc_probability", 0.0), ) - xtc_threshold: Optional[float] = Field( + xtc_threshold: float | None = Field( default_factory=lambda: get_default_sampler_value("xtc_threshold", 0.1) ) - frequency_penalty: Optional[float] = Field( + frequency_penalty: float | None = Field( default_factory=lambda: get_default_sampler_value("frequency_penalty", 0.0), ge=0, ) - presence_penalty: Optional[float] = Field( + presence_penalty: float | None = Field( default_factory=lambda: get_default_sampler_value("presence_penalty", 0.0), ge=0, ) - repetition_penalty: Optional[float] = Field( + repetition_penalty: float | None = Field( default_factory=lambda: get_default_sampler_value("repetition_penalty", 1.0), validation_alias=AliasChoices("repetition_penalty", "rep_pen"), description="Aliases: rep_pen", @@ -150,7 +149,7 @@ class BaseSamplerRequest(BaseModel): gt=0, ) - penalty_range: Optional[int] = Field( + penalty_range: int | None = Field( default_factory=lambda: get_default_sampler_value("penalty_range", -1), validation_alias=AliasChoices( "penalty_range", @@ -163,91 +162,91 @@ class BaseSamplerRequest(BaseModel): ), ) - repetition_decay: Optional[int] = Field( + repetition_decay: int | None = Field( default_factory=lambda: get_default_sampler_value("repetition_decay", 0) ) - dry_multiplier: Optional[float] = Field( + dry_multiplier: float | None = Field( default_factory=lambda: get_default_sampler_value("dry_multiplier", 0.0) ) - dry_base: Optional[float] = Field( + dry_base: float | None = Field( default_factory=lambda: get_default_sampler_value("dry_base", 0.0) ) - dry_allowed_length: Optional[int] = Field( + dry_allowed_length: int | None = Field( default_factory=lambda: get_default_sampler_value("dry_allowed_length", 0) ) - dry_range: Optional[int] = Field( + dry_range: int | None = Field( default_factory=lambda: get_default_sampler_value("dry_range", 0), validation_alias=AliasChoices("dry_range", "dry_penalty_last_n"), description=("Aliases: 
dry_penalty_last_n"), ) - dry_sequence_breakers: Optional[Union[str, List[str]]] = Field( + dry_sequence_breakers: str | list[str] | None = Field( default_factory=lambda: get_default_sampler_value("dry_sequence_breakers", []) ) - mirostat_mode: Optional[int] = Field( + mirostat_mode: int | None = Field( default_factory=lambda: get_default_sampler_value("mirostat_mode", 0), alias=AliasChoices("mirostat_mode", "mirostat"), ) - mirostat_tau: Optional[float] = Field( + mirostat_tau: float | None = Field( default_factory=lambda: get_default_sampler_value("mirostat_tau", 1.5), examples=[1.5], ) - mirostat_eta: Optional[float] = Field( + mirostat_eta: float | None = Field( default_factory=lambda: get_default_sampler_value("mirostat_eta", 0.3), examples=[0.3], ) - add_bos_token: Optional[bool] = Field( + add_bos_token: bool | None = Field( default_factory=lambda: get_default_sampler_value("add_bos_token") ) - ban_eos_token: Optional[bool] = Field( + ban_eos_token: bool | None = Field( default_factory=lambda: get_default_sampler_value("ban_eos_token", False), validation_alias=AliasChoices("ban_eos_token", "ignore_eos"), description="Aliases: ignore_eos", examples=[False], ) - logit_bias: Optional[Dict[int, float]] = Field( + logit_bias: dict[int, float] | None = Field( default_factory=lambda: get_default_sampler_value("logit_bias"), examples=[{"1": 10, "2": 50}], ) - negative_prompt: Optional[str] = Field( + negative_prompt: str | None = Field( default_factory=lambda: get_default_sampler_value("negative_prompt") ) - json_schema: Optional[object] = Field( + json_schema: object | None = Field( default_factory=lambda: get_default_sampler_value("json_schema"), ) - regex_pattern: Optional[str] = Field( + regex_pattern: str | None = Field( default_factory=lambda: get_default_sampler_value("regex_pattern"), ) - grammar_string: Optional[str] = Field( + grammar_string: str | None = Field( default_factory=lambda: get_default_sampler_value("grammar_string"), ) - speculative_ngram: Optional[bool] = Field( + speculative_ngram: bool | None = Field( default_factory=lambda: get_default_sampler_value("speculative_ngram"), ) - cfg_scale: Optional[float] = Field( + cfg_scale: float | None = Field( default_factory=lambda: get_default_sampler_value("cfg_scale", 1.0), validation_alias=AliasChoices("cfg_scale", "guidance_scale"), description="Aliases: guidance_scale", examples=[1.0], ) - max_temp: Optional[float] = Field( + max_temp: float | None = Field( default_factory=lambda: get_default_sampler_value("max_temp", 1.0), validation_alias=AliasChoices("max_temp", "dynatemp_high"), description="Aliases: dynatemp_high", @@ -255,7 +254,7 @@ class BaseSamplerRequest(BaseModel): ge=0, ) - min_temp: Optional[float] = Field( + min_temp: float | None = Field( default_factory=lambda: get_default_sampler_value("min_temp", 1.0), validation_alias=AliasChoices("min_temp", "dynatemp_low"), description="Aliases: dynatemp_low", @@ -263,14 +262,14 @@ class BaseSamplerRequest(BaseModel): ge=0, ) - temp_exponent: Optional[float] = Field( + temp_exponent: float | None = Field( default_factory=lambda: get_default_sampler_value("temp_exponent", 1.0), validation_alias=AliasChoices("temp_exponent", "dynatemp_exponent"), examples=[1.0], ge=0, ) - logprobs: Optional[int] = Field( + logprobs: int | None = Field( default_factory=lambda: get_default_sampler_value("logprobs", 0), ge=0, ) @@ -335,7 +334,7 @@ def after_validate(self): class SamplerOverridesContainer(BaseModel): - selected_preset: Optional[str] = None + selected_preset: str | None = 
None overrides: dict = {} diff --git a/common/tabby_config.py b/common/tabby_config.py index 9c4cc5d6..212746f1 100644 --- a/common/tabby_config.py +++ b/common/tabby_config.py @@ -2,7 +2,6 @@ from inspect import getdoc from os import getenv from textwrap import dedent -from typing import Optional from loguru import logger from pydantic import BaseModel @@ -22,7 +21,7 @@ class TabbyConfig(TabbyConfigModel): model_defaults: dict = {} draft_model_defaults: dict = {} - def load(self, arguments: Optional[dict] = None): + def load(self, arguments: dict | None = None): """Synchronously loads the global application config""" # config is applied in order of items in the list @@ -45,7 +44,7 @@ def load(self, arguments: Optional[dict] = None): value = getattr(merged_config_model, field) setattr(self, field, value) - # Set model defaults dict once to prevent on-demand reconstruction + # set model defaults dict once to prevent on-demand reconstruction # TODO: clean this up a bit for field in self.model.use_as_default: if hasattr(self.model, field): diff --git a/common/templating.py b/common/templating.py index cc0cceb1..864ee337 100644 --- a/common/templating.py +++ b/common/templating.py @@ -7,7 +7,6 @@ from dataclasses import dataclass, field from datetime import datetime from importlib.metadata import version as package_version -from typing import List, Optional from jinja2 import Template, TemplateError from jinja2.ext import loopcontrols from jinja2.sandbox import ImmutableSandboxedEnvironment @@ -28,8 +27,8 @@ class TemplateLoadError(Exception): class TemplateMetadata: """Represents the parsed metadata from a template.""" - stop_strings: List[str] = field(default_factory=list) - tool_start: Optional[str] = None + stop_strings: list[str] = field(default_factory=list) + tool_start: str | None = None class PromptTemplate: @@ -44,7 +43,7 @@ class PromptTemplate: enable_async=True, extensions=[loopcontrols], ) - metadata: Optional[TemplateMetadata] = None + metadata: TemplateMetadata | None = None async def extract_metadata(self, template_vars: dict): """ @@ -145,7 +144,7 @@ async def from_file(cls, template_path: pathlib.Path): @classmethod async def from_model_json( - cls, json_path: pathlib.Path, key: str, name: Optional[str] = None + cls, json_path: pathlib.Path, key: str, name: str | None = None ): """Get a template from a JSON file. Requires a key and template name""" if not json_path.exists(): diff --git a/common/transformers_utils.py b/common/transformers_utils.py index a7b0f0c1..92441564 100644 --- a/common/transformers_utils.py +++ b/common/transformers_utils.py @@ -3,7 +3,6 @@ import pathlib from loguru import logger from pydantic import BaseModel -from typing import Dict, List, Optional, Set, Union class GenerationConfig(BaseModel): @@ -12,7 +11,7 @@ class GenerationConfig(BaseModel): Will be expanded as needed. 
""" - eos_token_id: Optional[Union[int, List[int]]] = None + eos_token_id: int | list[int] | None = None @classmethod async def from_directory(cls, model_directory: pathlib.Path): @@ -44,8 +43,8 @@ class HuggingFaceConfig(BaseModel): """ max_position_embeddings: int = 4096 - eos_token_id: Optional[Union[int, List[int]]] = None - quantization_config: Optional[Dict] = None + eos_token_id: int | list[int] | None = None + quantization_config: dict | None = None @classmethod async def from_directory(cls, model_directory: pathlib.Path): @@ -62,7 +61,7 @@ async def from_directory(cls, model_directory: pathlib.Path): def quant_method(self): """Wrapper method to fetch quant type""" - if isinstance(self.quantization_config, Dict): + if isinstance(self.quantization_config, dict): return self.quantization_config.get("quant_method") else: return None @@ -83,7 +82,7 @@ class TokenizerConfig(BaseModel): An abridged version of HuggingFace's tokenizer config. """ - add_bos_token: Optional[bool] = True + add_bos_token: bool | None = True @classmethod async def from_directory(cls, model_directory: pathlib.Path): @@ -111,8 +110,8 @@ class HFModel: """ hf_config: HuggingFaceConfig - tokenizer_config: Optional[TokenizerConfig] = None - generation_config: Optional[GenerationConfig] = None + tokenizer_config: TokenizerConfig | None = None + generation_config: GenerationConfig | None = None @classmethod async def from_directory(cls, model_directory: pathlib.Path): @@ -156,7 +155,7 @@ def quant_method(self): def eos_tokens(self): """Combines and returns EOS tokens from various configs""" - eos_ids: Set[int] = set() + eos_ids: set[int] = set() eos_ids.update(self.hf_config.eos_tokens()) diff --git a/common/utils.py b/common/utils.py index b0d7ad24..da841799 100644 --- a/common/utils.py +++ b/common/utils.py @@ -1,12 +1,12 @@ """Common utility functions""" -from types import NoneType -from typing import Dict, Optional, Type, Union, get_args, get_origin, TypeVar +from types import NoneType, UnionType +from typing import Type, get_args, get_origin, TypeVar T = TypeVar("T") -def unwrap(wrapped: Optional[T], default: T = None) -> T: +def unwrap(wrapped: T | None, default: T = None) -> T: """Unwrap function for Optionals.""" if wrapped is None: return default @@ -19,7 +19,7 @@ def coalesce(*args): return next((arg for arg in args if arg is not None), None) -def filter_none_values(collection: Union[dict, list]) -> Union[dict, list]: +def filter_none_values(collection: dict | list) -> dict | list: """Remove None values from a collection.""" if isinstance(collection, dict): @@ -32,7 +32,7 @@ def filter_none_values(collection: Union[dict, list]) -> Union[dict, list]: return collection -def deep_merge_dict(dict1: Dict, dict2: Dict, copy: bool = False) -> Dict: +def deep_merge_dict(dict1: dict, dict2: dict, copy: bool = False) -> dict: """ Merge 2 dictionaries. If copy is true, the original dictionary isn't modified. """ @@ -49,7 +49,7 @@ def deep_merge_dict(dict1: Dict, dict2: Dict, copy: bool = False) -> Dict: return dict1 -def deep_merge_dicts(*dicts: Dict) -> Dict: +def deep_merge_dicts(*dicts: dict) -> dict: """ Merge an arbitrary amount of dictionaries. We wanna do in-place modification for each level, so do not copy. @@ -84,11 +84,13 @@ def is_list_type(type_hint) -> bool: def unwrap_optional_type(type_hint) -> Type: """ - Unwrap Optional[type] annotations. + Unwrap type | None annotations to extract the base type. This is not the same as unwrap. 
""" - if get_origin(type_hint) is Union: + origin = get_origin(type_hint) + + if origin is UnionType: args = get_args(type_hint) if NoneType in args: for arg in args: diff --git a/config_sample.yml b/config_sample.yml index 0b65f9e8..88fa308a 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -79,7 +79,7 @@ model: backend: # Max sequence length (default: min(max_position_embeddings, cache_size)). - # Set to -1 to fetch from the model's config.json + # set to -1 to fetch from the model's config.json max_seq_len: # Size of the key/value cache to allocate, in tokens (default: 4096). @@ -126,7 +126,7 @@ model: rope_scale: 1.0 # Rope alpha (default: None). - # Same as alpha_value. Set to "auto" to auto-calculate. + # Same as alpha_value. set to "auto" to auto-calculate. # Leaving this value blank will either pull from the model or auto-calculate. rope_alpha: @@ -141,12 +141,12 @@ model: # Used by EXL3 models only. output_chunking: true - # Set the maximum number of prompts to process at one time (default: None/Automatic). + # set the maximum number of prompts to process at one time (default: None/Automatic). # Automatically calculated if left blank. # NOTE: Only available for Nvidia ampere (30 series) and above GPUs. max_batch_size: - # Set the prompt template for this model. (default: None) + # set the prompt template for this model. (default: None) # If empty, attempts to look for the model's chat template. # If a model contains multiple templates in its tokenizer_config.json, # set prompt_template to the name of the template you want to use. @@ -172,7 +172,7 @@ draft_model: draft_rope_scale: 1.0 # Rope alpha for draft models (default: None). - # Same as alpha_value. Set to "auto" to auto-calculate. + # Same as alpha_value. set to "auto" to auto-calculate. # Leaving this value blank will either pull from the model or auto-calculate. draft_rope_alpha: @@ -199,7 +199,7 @@ lora: # Directory to look for LoRAs (default: loras). lora_dir: loras - # List of LoRAs to load and associated scaling factors (default scale: 1.0). + # list of LoRAs to load and associated scaling factors (default scale: 1.0). # For the YAML file, add each entry as a YAML list: # - name: lora1 # scaling: 1.0 @@ -230,7 +230,7 @@ developer: # Disable API request streaming (default: False). disable_request_streaming: false - # Set process to use a higher priority. + # set process to use a higher priority. # For realtime process priority, run as administrator or sudo. # Otherwise, the priority will be set to high. realtime_process_priority: false diff --git a/docker/Dockerfile b/docker/Dockerfile index 58aa61f5..ba038cec 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,7 +20,7 @@ ENV PATH="/opt/venv/bin:$PATH" # Upgrade pip RUN pip install --no-cache-dir --upgrade pip -# Set the working directory in the container +# set the working directory in the container WORKDIR /app # Get requirements @@ -37,7 +37,7 @@ COPY . . # Make port 5000 available to the world outside this container EXPOSE 5000 -# Set the entry point +# set the entry point ENTRYPOINT ["python3"] # Run main.py when the container launches diff --git a/docs/01.-Getting-Started.md b/docs/01.-Getting-Started.md index 330116c6..7e74abd9 100644 --- a/docs/01.-Getting-Started.md +++ b/docs/01.-Getting-Started.md @@ -171,7 +171,7 @@ These are short-form instructions for other methods that users can use to instal 2. For Windows: [Cuda Toolkit on WSL](https://docs.nvidia.com/cuda/wsl-user-guide/index.html) 3. 
Clone TabbyAPI via `git clone https://github.com/theroyallab/tabbyAPI` 4. Enter the tabbyAPI directory by `cd tabbyAPI`. - 1. Optional: Set up a config.yml or api_tokens.yml ([configuration](#configuration)) + 1. Optional: set up a config.yml or api_tokens.yml ([configuration](#configuration)) 5. Update the volume mount section in the `docker/docker-compose.yml` file ```yml volumes: diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md index 98cee556..03f866ad 100644 --- a/docs/02.-Server-options.md +++ b/docs/02.-Server-options.md @@ -16,12 +16,12 @@ All of these options have descriptive comments above them. You should not need t | Config Option | Type (Default) | Description | | ---------------------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| host | String (127.0.0.1) | Set the IP address used for hosting TabbyAPI | -| port | Int (5000) | Set the TCP Port use for TabbyAPI | +| host | String (127.0.0.1) | set the IP address used for hosting TabbyAPI | +| port | Int (5000) | set the TCP Port use for TabbyAPI | | disable_auth | Bool (False) | Disables API authentication | | disable_fetch_requests | Bool (False) | Disables fetching external content when responding to requests (ex. fetching images from URLs) | | send_tracebacks | Bool (False) | Send server tracebacks to client.
Note: It's not recommended to enable this if sharing the instance with others. | -| api_servers | List[String] (["OAI"]) | API servers to enable. Possible values `"OAI", "Kobold"` | +| api_servers | list[String] (["OAI"]) | API servers to enable. Possible values `"OAI", "Kobold"` | ### Logging Options @@ -47,7 +47,7 @@ Note: These are experimental flags that may be removed at any point. | ------------------------- | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | unsafe_launch | Bool (False) | Skips dependency checks on startup. Only recommended for debugging. | | disable_request_streaming | Bool (False) | Forcefully disables streaming requests | -| realtime_process_priority | Bool (False) | Set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. | +| realtime_process_priority | Bool (False) | set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. | ### Model Options @@ -58,14 +58,14 @@ Note: Most of the options here will only apply on initial model load/startup (ep | model_dir | String ("models") | Directory to look for models.
Note: Persisted across subsequent load requests | | inline_model_loading | Bool (False) | Enables ability to switch models using the `model` argument in a generation request. More info in [Usage](https://github.com/theroyallab/tabbyAPI/wiki/03.-Usage#inline-loading) | | use_dummy_models | Bool (False) | Send a dummy OAI model card when calling the `/v1/models` endpoint. Used for clients which enforce specific OAI models.
Note: Persisted across subsequent load requests | -| dummy_model_names | List[String] (["gpt-3.5-turbo"]) | List of dummy names to send on model endpoint requests | +| dummy_model_names | list[String] (["gpt-3.5-turbo"]) | list of dummy names to send on model endpoint requests | | model_name | String (None) | Folder name of a model to load. The below parameters will not apply unless this is filled out. | -| use_as_default | List[String] ([]) | Keys to use by default when loading models. For example, putting `cache_mode` in this array will make every model load with that value unless specified by the API request.
Note: Also applies to the `draft` sub-block | +| use_as_default | list[String] ([]) | Keys to use by default when loading models. For example, putting `cache_mode` in this array will make every model load with that value unless specified by the API request.
Note: Also applies to the `draft` sub-block | | max_seq_len | Float (None) | Maximum sequence length of the model. Uses the value from config.json if not specified here. Also called the max context length. | | tensor_parallel | Bool (False) | Enables tensor parallelism. Automatically falls back to autosplit if GPU split isn't provided.
Note: `gpu_split_auto` is ignored when this is enabled. | | gpu_split_auto | Bool (True) | Automatically split the model across multiple GPUs. Manual GPU split isn't used if this is enabled. | -| autosplit_reserve | List[Int] ([96]) | Amount of empty VRAM to reserve when loading with autosplit.
Represented as an array of MB per GPU used. | -| gpu_split | List[Float] ([]) | Float array of GBs to split a model between GPUs. | +| autosplit_reserve | list[Int] ([96]) | Amount of empty VRAM to reserve when loading with autosplit.
Represented as an array of MB per GPU used. | +| gpu_split | list[Float] ([]) | Float array of GBs to split a model between GPUs. | | rope_scale | Float (1.0) | Adjustment for rope scale (or compress_pos_emb)
Note: If the model has YaRN support, this option will not apply. | | rope_alpha | Float (None) | Adjustment for rope alpha. Leave blank to automatically calculate based on the max_seq_len.
Note: If the model has YaRN support, this option will not apply. | | cache_mode | String ("FP16") | Cache mode for the model.
Options: FP16, Q8, Q6, Q4 | @@ -86,7 +86,7 @@ Note: Sub-block of Model Options. Same rules apply. | draft_rope_scale | Float (1.0) | String: RoPE scale value for the draft model. | | draft_rope_alpha | Float (1.0) | RoPE alpha value for the draft model. Leave blank for auto-calculation. | | draft_cache_mode | String ("FP16") | Cache mode for the draft model.
Options: FP16, Q8, Q6, Q4 | -| draft_gpu_split | List[Float] ([]) | Float array of GBs to split a draft model between GPUs. | +| draft_gpu_split | list[Float] ([]) | Float array of GBs to split a draft model between GPUs. | ### Lora Options @@ -95,7 +95,7 @@ Note: Sub-block of Mode Options. Same rules apply. | Config Option | Type (Default) | Description | |---------------|------------------|--------------------------------------------------------------| | lora_dir | String ("loras") | Directory to look for loras.
Note: Persisted across subsequent load requests | -| loras | List[loras] ([]) | List of lora objects to apply to the model. Each object contains a name and scaling. | +| loras | list[loras] ([]) | list of lora objects to apply to the model. Each object contains a name and scaling. | | name | String (None) | Folder name of a lora to load.
Note: An element of the `loras` key | | scaling | Float (1.0) | "Weight" to apply the lora on the parent model. For example, applying a lora with 0.9 scaling will lower the amount of application on the parent model.
Note: An element of the `loras` key | diff --git a/docs/06.-Sharing.md b/docs/06.-Sharing.md index 86288d80..a9f69559 100644 --- a/docs/06.-Sharing.md +++ b/docs/06.-Sharing.md @@ -28,7 +28,7 @@ Tailscale is a product that uses the WireGuard protocol to provide a mesh networ > This is not a method for exposing your TabbyAPI instance to the world. If you want that, use the other two services. To get started: -1. Set your TabbyAPI ip to `0.0.0.0` otherwise you will not be able to access your instance outside your local machine. +1. set your TabbyAPI ip to `0.0.0.0` otherwise you will not be able to access your instance outside your local machine. 2. Sign up and get started on [Tailscale's website](https://tailscale.com/), then install the client. 3. Connect to your tailscale account on both your host and client machine. 4. Select the Tailscale icon (usually in the system tray) and get the IP of your host device. This is usually identified by the hostname. diff --git a/endpoints/Kobold/types/generation.py b/endpoints/Kobold/types/generation.py index 5432130b..a58bceba 100644 --- a/endpoints/Kobold/types/generation.py +++ b/endpoints/Kobold/types/generation.py @@ -1,6 +1,5 @@ from functools import partial from pydantic import BaseModel, Field, field_validator -from typing import List, Optional from common.sampling import BaseSamplerRequest, get_default_sampler_value from common.utils import unwrap @@ -8,9 +7,9 @@ class GenerateRequest(BaseSamplerRequest): prompt: str - genkey: Optional[str] = None - use_default_badwordsids: Optional[bool] = False - dynatemp_range: Optional[float] = Field( + genkey: str | None = None + use_default_badwordsids: bool | None = False + dynatemp_range: float | None = Field( default_factory=partial(get_default_sampler_value, "dynatemp_range") ) @@ -43,7 +42,7 @@ class GenerateResponseResult(BaseModel): class GenerateResponse(BaseModel): - results: List[GenerateResponseResult] = Field(default_factory=list) + results: list[GenerateResponseResult] = Field(default_factory=list) class StreamGenerateChunk(BaseModel): diff --git a/endpoints/Kobold/types/token.py b/endpoints/Kobold/types/token.py index e6639d94..3f5a9fe6 100644 --- a/endpoints/Kobold/types/token.py +++ b/endpoints/Kobold/types/token.py @@ -1,5 +1,4 @@ from pydantic import BaseModel -from typing import List class TokenCountRequest(BaseModel): @@ -12,4 +11,4 @@ class TokenCountResponse(BaseModel): """Represents a KAI tokenization response.""" value: int - ids: List[int] + ids: list[int] diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py index 8f4e7a4e..bb64a48a 100644 --- a/endpoints/OAI/router.py +++ b/endpoints/OAI/router.py @@ -72,7 +72,7 @@ async def completion_request( disable_request_streaming = config.developer.disable_request_streaming - # Set an empty JSON schema if the request wants a JSON response + # set an empty JSON schema if the request wants a JSON response if data.response_format.type == "json": data.json_schema = {"type": "object"} @@ -125,7 +125,7 @@ async def chat_completion_request( prompt, embeddings = await apply_chat_template(data) - # Set an empty JSON schema if the request wants a JSON response + # set an empty JSON schema if the request wants a JSON response if data.response_format.type == "json": data.json_schema = {"type": "object"} diff --git a/endpoints/OAI/types/chat_completion.py b/endpoints/OAI/types/chat_completion.py index 52523149..86118463 100644 --- a/endpoints/OAI/types/chat_completion.py +++ b/endpoints/OAI/types/chat_completion.py @@ -1,6 +1,8 @@ +from 
__future__ import annotations + from pydantic import AliasChoices, BaseModel, Field, field_validator from time import time -from typing import Literal, Union, List, Optional, Dict +from typing import Literal from uuid import uuid4 from endpoints.OAI.types.common import UsageStats, CommonCompletionRequest @@ -10,11 +12,11 @@ class ChatCompletionLogprob(BaseModel): token: str logprob: float - top_logprobs: Optional[List["ChatCompletionLogprob"]] = Field(default_factory=list) + top_logprobs: list[ChatCompletionLogprob] | None = Field(default_factory=list) class ChatCompletionLogprobs(BaseModel): - content: List[ChatCompletionLogprob] = Field(default_factory=list) + content: list[ChatCompletionLogprob] = Field(default_factory=list) class ChatCompletionImageUrl(BaseModel): @@ -23,58 +25,58 @@ class ChatCompletionImageUrl(BaseModel): class ChatCompletionMessagePart(BaseModel): type: Literal["text", "image_url"] = "text" - text: Optional[str] = None - image_url: Optional[ChatCompletionImageUrl] = None + text: str | None = None + image_url: ChatCompletionImageUrl | None = None class ChatCompletionMessage(BaseModel): role: str = "user" - content: Optional[Union[str, List[ChatCompletionMessagePart]]] = None - tool_calls: Optional[List[ToolCall]] = None - tool_call_id: Optional[str] = None + content: str | list[ChatCompletionMessagePart] | None = None + tool_calls: list[ToolCall] | None = None + tool_call_id: str | None = None class ChatCompletionRespChoice(BaseModel): # Index is 0 since we aren't using multiple choices index: int = 0 - finish_reason: Optional[str] = None + finish_reason: str | None = None # let's us understand why it stopped and if we need to generate a tool_call - stop_str: Optional[str] = None + stop_str: str | None = None message: ChatCompletionMessage - logprobs: Optional[ChatCompletionLogprobs] = None + logprobs: ChatCompletionLogprobs | None = None class ChatCompletionStreamChoice(BaseModel): # Index is 0 since we aren't using multiple choices index: int = 0 - finish_reason: Optional[str] = None - delta: Union[ChatCompletionMessage, dict] = {} - logprobs: Optional[ChatCompletionLogprobs] = None + finish_reason: str | None = None + delta: ChatCompletionMessage | dict = {} + logprobs: ChatCompletionLogprobs | None = None # Inherited from common request class ChatCompletionRequest(CommonCompletionRequest): - messages: List[ChatCompletionMessage] - prompt_template: Optional[str] = None - add_generation_prompt: Optional[bool] = True - template_vars: Optional[dict] = Field( + messages: list[ChatCompletionMessage] + prompt_template: str | None = None + add_generation_prompt: bool | None = True + template_vars: dict | None = Field( default={}, validation_alias=AliasChoices("template_vars", "chat_template_kwargs"), description="Aliases: chat_template_kwargs", ) - response_prefix: Optional[str] = None - model: Optional[str] = None + response_prefix: str | None = None + model: str | None = None # tools is follows the format OAI schema, functions is more flexible # both are available in the chat template. - tools: Optional[List[ToolSpec]] = None - functions: Optional[List[Dict]] = None + tools: list[ToolSpec] | None = None + functions: list[dict] | None = None # Chat completions requests do not have a BOS token preference. Backend # respects the tokenization config for the individual model. 
- add_bos_token: Optional[bool] = None + add_bos_token: bool | None = None @field_validator("add_bos_token", mode="after") def force_bos_token(cls, v): @@ -84,17 +86,17 @@ def force_bos_token(cls, v): class ChatCompletionResponse(BaseModel): id: str = Field(default_factory=lambda: f"chatcmpl-{uuid4().hex}") - choices: List[ChatCompletionRespChoice] + choices: list[ChatCompletionRespChoice] created: int = Field(default_factory=lambda: int(time())) model: str object: str = "chat.completion" - usage: Optional[UsageStats] = None + usage: UsageStats | None = None class ChatCompletionStreamChunk(BaseModel): id: str = Field(default_factory=lambda: f"chatcmpl-{uuid4().hex}") - choices: List[ChatCompletionStreamChoice] + choices: list[ChatCompletionStreamChoice] created: int = Field(default_factory=lambda: int(time())) model: str object: str = "chat.completion.chunk" - usage: Optional[UsageStats] = None + usage: UsageStats | None = None diff --git a/endpoints/OAI/types/common.py b/endpoints/OAI/types/common.py index 16ef2edd..fb7a4a82 100644 --- a/endpoints/OAI/types/common.py +++ b/endpoints/OAI/types/common.py @@ -1,7 +1,6 @@ """Common types for OAI.""" from pydantic import BaseModel, Field -from typing import Optional, Union from common.sampling import BaseSamplerRequest, get_default_sampler_value @@ -10,13 +9,13 @@ class UsageStats(BaseModel): """Represents usage stats.""" prompt_tokens: int - prompt_time: Optional[float] = None - prompt_tokens_per_sec: Optional[Union[float, str]] = None + prompt_time: float | None = None + prompt_tokens_per_sec: float | str | None = None completion_tokens: int - completion_time: Optional[float] = None - completion_tokens_per_sec: Optional[Union[float, str]] = None + completion_time: float | None = None + completion_tokens_per_sec: float | str | None = None total_tokens: int - total_time: Optional[float] = None + total_time: float | None = None class CompletionResponseFormat(BaseModel): @@ -24,7 +23,7 @@ class CompletionResponseFormat(BaseModel): class ChatCompletionStreamOptions(BaseModel): - include_usage: Optional[bool] = False + include_usage: bool | None = False class CommonCompletionRequest(BaseSamplerRequest): @@ -32,29 +31,29 @@ class CommonCompletionRequest(BaseSamplerRequest): # Model information # This parameter is not used, the loaded model is used instead - model: Optional[str] = None + model: str | None = None # Generation info (remainder is in BaseSamplerRequest superclass) - stream: Optional[bool] = False - stream_options: Optional[ChatCompletionStreamOptions] = None - response_format: Optional[CompletionResponseFormat] = Field( + stream: bool | None = False + stream_options: ChatCompletionStreamOptions | None = None + response_format: CompletionResponseFormat | None = Field( default_factory=CompletionResponseFormat ) - n: Optional[int] = Field( + n: int | None = Field( default_factory=lambda: get_default_sampler_value("n", 1), ge=1, ) # Extra OAI request stuff - best_of: Optional[int] = Field( + best_of: int | None = Field( description="Not parsed. Only used for OAI compliance.", default=None ) - echo: Optional[bool] = Field( + echo: bool | None = Field( description="Not parsed. Only used for OAI compliance.", default=False ) - suffix: Optional[str] = Field( + suffix: str | None = Field( description="Not parsed. Only used for OAI compliance.", default=None ) - user: Optional[str] = Field( + user: str | None = Field( description="Not parsed. 
Only used for OAI compliance.", default=None ) diff --git a/endpoints/OAI/types/completion.py b/endpoints/OAI/types/completion.py index d0a7187e..bb3f8443 100644 --- a/endpoints/OAI/types/completion.py +++ b/endpoints/OAI/types/completion.py @@ -2,7 +2,6 @@ from pydantic import BaseModel, Field from time import time -from typing import Dict, List, Optional, Union from uuid import uuid4 from endpoints.OAI.types.common import CommonCompletionRequest, UsageStats @@ -11,10 +10,10 @@ class CompletionLogProbs(BaseModel): """Represents log probabilities for a completion request.""" - text_offset: List[int] = Field(default_factory=list) - token_logprobs: List[Optional[float]] = Field(default_factory=list) - tokens: List[str] = Field(default_factory=list) - top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list) + text_offset: list[int] = Field(default_factory=list) + token_logprobs: list[float | None] = Field(default_factory=list) + tokens: list[str] = Field(default_factory=list) + top_logprobs: list[dict[str, float] | None] = Field(default_factory=list) class CompletionRespChoice(BaseModel): @@ -22,8 +21,8 @@ class CompletionRespChoice(BaseModel): # Index is 0 since we aren't using multiple choices index: int = 0 - finish_reason: Optional[str] = None - logprobs: Optional[CompletionLogProbs] = None + finish_reason: str | None = None + logprobs: CompletionLogProbs | None = None text: str @@ -33,15 +32,15 @@ class CompletionRequest(CommonCompletionRequest): # Prompt can also contain token ids, but that's out of scope # for this project. - prompt: Union[str, List[str]] + prompt: str | list[str] class CompletionResponse(BaseModel): """Represents a completion response.""" id: str = Field(default_factory=lambda: f"cmpl-{uuid4().hex}") - choices: List[CompletionRespChoice] + choices: list[CompletionRespChoice] created: int = Field(default_factory=lambda: int(time())) model: str object: str = "text_completion" - usage: Optional[UsageStats] = None + usage: UsageStats | None = None diff --git a/endpoints/OAI/types/embedding.py b/endpoints/OAI/types/embedding.py index 41419c4d..7eaefcf9 100644 --- a/endpoints/OAI/types/embedding.py +++ b/endpoints/OAI/types/embedding.py @@ -1,23 +1,21 @@ -from typing import List, Optional, Union - from pydantic import BaseModel, Field class UsageInfo(BaseModel): prompt_tokens: int = 0 total_tokens: int = 0 - completion_tokens: Optional[int] = 0 + completion_tokens: int | None = 0 class EmbeddingsRequest(BaseModel): - input: Union[str, List[str]] = Field( - ..., description="List of input texts to generate embeddings for." + input: str | list[str] = Field( + ..., description="list of input texts to generate embeddings for." ) encoding_format: str = Field( "float", description="Encoding format for the embeddings. Can be 'float' or 'base64'.", ) - model: Optional[str] = Field( + model: str | None = Field( None, description="Name of the embedding model to use. " "If not provided, the default model will be used.", @@ -26,8 +24,8 @@ class EmbeddingsRequest(BaseModel): class EmbeddingObject(BaseModel): object: str = Field("embedding", description="Type of the object.") - embedding: Union[List[float], str] = Field( - ..., description="Embedding values as a list of floats." + embedding: list[float] | str = Field( + ..., description="Embedding values as a List of floats." ) index: int = Field( ..., description="Index of the input text corresponding to the embedding." 
@@ -36,6 +34,6 @@ class EmbeddingObject(BaseModel): class EmbeddingsResponse(BaseModel): object: str = Field("list", description="Type of the response object.") - data: List[EmbeddingObject] = Field(..., description="List of embedding objects.") + data: list[EmbeddingObject] = Field(..., description="List of embedding objects.") model: str = Field(..., description="Name of the embedding model used.") usage: UsageInfo = Field(..., description="Information about token usage.") diff --git a/endpoints/OAI/types/tools.py b/endpoints/OAI/types/tools.py index b5b9611f..138fda9d 100644 --- a/endpoints/OAI/types/tools.py +++ b/endpoints/OAI/types/tools.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, Field -from typing import Dict, Literal +from typing import Literal from uuid import uuid4 @@ -8,7 +8,7 @@ class Function(BaseModel): name: str description: str - parameters: Dict[str, object] + parameters: dict[str, object] class ToolSpec(BaseModel): diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index b559bb2b..819a409c 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -3,7 +3,6 @@ import asyncio import pathlib from asyncio import CancelledError -from typing import List, Optional from fastapi import HTTPException, Request from jinja2 import TemplateError from loguru import logger @@ -33,7 +32,7 @@ def _create_response( - request_id: str, generations: List[dict], model_name: Optional[str] + request_id: str, generations: list[dict], model_name: str | None ): """Create a chat completion response from the provided text.""" @@ -70,7 +69,7 @@ def _create_response( logprob_response = ChatCompletionLogprobs(content=collected_token_probs) - # Set finish reason + # set finish reason if message.tool_calls: finish_reason = "tool_calls" else: @@ -111,8 +110,8 @@ def _create_response( def _create_stream_chunk( request_id: str, - generation: Optional[dict] = None, - model_name: Optional[str] = None, + generation: dict | None = None, + model_name: str | None = None, is_usage_chunk: bool = False, ): """Create a chat completion stream chunk from the provided text.""" @@ -212,8 +211,8 @@ async def _append_template_metadata(data: ChatCompletionRequest, template_vars: async def format_messages_with_template( - messages: List[ChatCompletionMessage], - existing_template_vars: Optional[dict] = None, + messages: list[ChatCompletionMessage], + existing_template_vars: dict | None = None, ): """Barebones function to format chat completion messages into a prompt.""" @@ -221,7 +220,7 @@ async def format_messages_with_template( mm_embeddings = MultimodalEmbeddingWrapper() if model.container.use_vision else None # Convert all messages to a dictionary representation - message_dicts: List[dict] = [] + message_dicts: list[dict] = [] for message in messages: if isinstance(message.content, list): concatenated_content = "" @@ -317,7 +316,7 @@ async def stream_generate_chat_completion( """Generator for the generation process.""" abort_event = asyncio.Event() gen_queue = asyncio.Queue() - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] tool_start = model.container.prompt_template.metadata.tool_start disconnect_task = asyncio.create_task(request_disconnect_loop(request)) @@ -414,7 +413,7 @@ async def generate_chat_completion( request: Request, model_path: pathlib.Path, ): - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] tool_start = model.container.prompt_template.metadata.tool_start try: @@ 
-462,14 +461,14 @@ async def generate_tool_calls( prompt: str, embeddings: MultimodalEmbeddingWrapper, data: ChatCompletionRequest, - generations: List[str], + generations: list[str], request: Request, ): - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] tool_start = model.container.prompt_template.metadata.tool_start # Tracks which generations asked for a tool call - tool_idx: List[int] = [] + tool_idx: list[int] = [] # Copy to make sure the parent JSON schema doesn't get modified tool_data = data.model_copy(deep=True) diff --git a/endpoints/OAI/utils/completion.py b/endpoints/OAI/utils/completion.py index f66d381d..4de10361 100644 --- a/endpoints/OAI/utils/completion.py +++ b/endpoints/OAI/utils/completion.py @@ -9,7 +9,6 @@ from asyncio import CancelledError from fastapi import HTTPException, Request from loguru import logger -from typing import List, Optional, Union from common import model from common.auth import get_key_permission @@ -39,7 +38,7 @@ def _parse_gen_request_id(n: int, request_id: str, task_idx: int): def _create_response( - request_id: str, generations: Union[dict, List[dict]], model_name: str = "" + request_id: str, generations: dict | list[dict], model_name: str = "" ): """Create a completion response from the provided choices.""" @@ -47,7 +46,7 @@ def _create_response( if not isinstance(generations, list): generations = [generations] - choices: List[CompletionRespChoice] = [] + choices: list[CompletionRespChoice] = [] for index, generation in enumerate(generations): logprob_response = None @@ -103,7 +102,7 @@ async def _stream_collector( prompt: str, params: CompletionRequest, abort_event: asyncio.Event, - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None, + mm_embeddings: MultimodalEmbeddingWrapper | None = None, ): """Collects a stream and places results in a common queue""" @@ -200,7 +199,7 @@ async def stream_generate_completion( abort_event = asyncio.Event() gen_queue = asyncio.Queue() - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] disconnect_task = asyncio.create_task(request_disconnect_loop(request)) try: @@ -261,7 +260,7 @@ async def generate_completion( ): """Non-streaming generate for completions""" - gen_tasks: List[asyncio.Task] = [] + gen_tasks: list[asyncio.Task] = [] try: logger.info(f"Received completion request {request.state.id}") diff --git a/endpoints/OAI/utils/tools.py b/endpoints/OAI/utils/tools.py index c1ebdedf..66e466fe 100644 --- a/endpoints/OAI/utils/tools.py +++ b/endpoints/OAI/utils/tools.py @@ -1,6 +1,5 @@ import json from loguru import logger -from typing import List from endpoints.OAI.types.tools import ToolCall @@ -30,7 +29,7 @@ class ToolCallProcessor: @staticmethod - def from_json(tool_calls_str: str) -> List[ToolCall]: + def from_json(tool_calls_str: str) -> list[ToolCall]: """Postprocess tool call JSON to a parseable class""" tool_calls = json.loads(tool_calls_str) @@ -42,15 +41,15 @@ def from_json(tool_calls_str: str) -> List[ToolCall]: return [ToolCall(**tool_call) for tool_call in tool_calls] @staticmethod - def dump(tool_calls: List[ToolCall]) -> List[dict]: + def dump(tool_calls: list[ToolCall]) -> list[dict]: """ Convert ToolCall objects to a list of dictionaries. 
Args: - tool_calls (List[ToolCall]): List of ToolCall objects to convert + tool_calls (list[ToolCall]): list of ToolCall objects to convert Returns: - List[dict]: List of dictionaries representing the tool calls + list[dict]: list of dictionaries representing the tool calls """ # Don't use list comprehension here @@ -64,12 +63,12 @@ def dump(tool_calls: List[ToolCall]) -> List[dict]: return dumped_tool_calls @staticmethod - def to_json(tool_calls: List[ToolCall]) -> str: + def to_json(tool_calls: list[ToolCall]) -> str: """ Convert ToolCall objects to JSON string representation. Args: - tool_calls (List[ToolCall]): List of ToolCall objects to convert + tool_calls (list[ToolCall]): list of ToolCall objects to convert Returns: str: JSON representation of the tool calls diff --git a/endpoints/core/router.py b/endpoints/core/router.py index 800c7d90..3b362a73 100644 --- a/endpoints/core/router.py +++ b/endpoints/core/router.py @@ -1,7 +1,6 @@ import asyncio import pathlib from sys import maxsize -from typing import Optional from common.multimodal import MultimodalEmbeddingWrapper from fastapi import APIRouter, Depends, HTTPException, Request, Response from fastapi.responses import JSONResponse @@ -275,7 +274,7 @@ async def load_lora(data: LoraLoadRequest) -> LoraLoadResponse: if not data.loras: error_message = handle_request_error( - "List of loras to load is not found.", + "list of loras to load is not found.", exc_info=False, ).error.message @@ -403,7 +402,7 @@ async def unload_embedding_model(): async def encode_tokens(data: TokenEncodeRequest) -> TokenEncodeResponse: """Encodes a string or chat completion messages into tokens.""" - mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None + mm_embeddings: MultimodalEmbeddingWrapper | None = None if isinstance(data.text, str): text = data.text @@ -547,7 +546,7 @@ async def unload_template(): @router.get("/v1/sampling/override/list", dependencies=[Depends(check_api_key)]) async def list_sampler_overrides(request: Request) -> SamplerOverrideListResponse: """ - List all currently applied sampler overrides. + list all currently applied sampler overrides. Requires an admin key to see all override presets. 
""" diff --git a/endpoints/core/types/download.py b/endpoints/core/types/download.py index cf49501f..c9f8a040 100644 --- a/endpoints/core/types/download.py +++ b/endpoints/core/types/download.py @@ -1,5 +1,4 @@ from pydantic import BaseModel, Field -from typing import List, Optional def _generate_include_list(): @@ -11,13 +10,13 @@ class DownloadRequest(BaseModel): repo_id: str repo_type: str = "model" - folder_name: Optional[str] = None - revision: Optional[str] = None - token: Optional[str] = None - include: List[str] = Field(default_factory=_generate_include_list) - exclude: List[str] = Field(default_factory=list) - chunk_limit: Optional[int] = None - timeout: Optional[int] = None + folder_name: str | None = None + revision: str | None = None + token: str | None = None + include: list[str] = Field(default_factory=_generate_include_list) + exclude: list[str] = Field(default_factory=list) + chunk_limit: int | None = None + timeout: int | None = None class DownloadResponse(BaseModel): diff --git a/endpoints/core/types/health.py b/endpoints/core/types/health.py index ad5fffef..189fa008 100644 --- a/endpoints/core/types/health.py +++ b/endpoints/core/types/health.py @@ -11,5 +11,5 @@ class HealthCheckResponse(BaseModel): "healthy", description="System health status" ) issues: list[UnhealthyEvent] = Field( - default_factory=list, description="List of issues" + default_factory=list, description="list of issues" ) diff --git a/endpoints/core/types/lora.py b/endpoints/core/types/lora.py index 8435a8a4..88e9ba8f 100644 --- a/endpoints/core/types/lora.py +++ b/endpoints/core/types/lora.py @@ -2,7 +2,6 @@ from pydantic import BaseModel, Field from time import time -from typing import Optional, List class LoraCard(BaseModel): @@ -12,32 +11,32 @@ class LoraCard(BaseModel): object: str = "lora" created: int = Field(default_factory=lambda: int(time())) owned_by: str = "tabbyAPI" - scaling: Optional[float] = None + scaling: float | None = None class LoraList(BaseModel): """Represents a list of Lora cards.""" object: str = "list" - data: List[LoraCard] = Field(default_factory=list) + data: list[LoraCard] = Field(default_factory=list) class LoraLoadInfo(BaseModel): """Represents a single Lora load info.""" name: str - scaling: Optional[float] = 1.0 + scaling: float | None = 1.0 class LoraLoadRequest(BaseModel): """Represents a Lora load request.""" - loras: List[LoraLoadInfo] + loras: list[LoraLoadInfo] skip_queue: bool = False class LoraLoadResponse(BaseModel): """Represents a Lora load response.""" - success: List[str] = Field(default_factory=list) - failure: List[str] = Field(default_factory=list) + success: list[str] = Field(default_factory=list) + failure: list[str] = Field(default_factory=list) diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index 84229294..0f4252fb 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -1,8 +1,10 @@ """Contains model card types.""" +from __future__ import annotations + from pydantic import BaseModel, Field, ConfigDict from time import time -from typing import List, Literal, Optional, Union +from typing import Literal from common.config_models import LoggingConfig from common.tabby_config import config @@ -13,19 +15,19 @@ class ModelCardParameters(BaseModel): # Safe to do this since it's guaranteed to fetch a max seq len # from model_container - max_seq_len: Optional[int] = None - cache_size: Optional[int] = None - cache_mode: Optional[str] = "FP16" - rope_scale: Optional[float] = 1.0 - rope_alpha: 
Optional[float] = 1.0 - max_batch_size: Optional[int] = 1 - chunk_size: Optional[int] = 2048 - prompt_template: Optional[str] = None - prompt_template_content: Optional[str] = None - use_vision: Optional[bool] = False + max_seq_len: int | None = None + cache_size: int | None = None + cache_mode: str | None = "FP16" + rope_scale: float | None = 1.0 + rope_alpha: float | None = 1.0 + max_batch_size: int | None = 1 + chunk_size: int | None = 2048 + prompt_template: str | None = None + prompt_template_content: str | None = None + use_vision: bool | None = False # Draft is another model, so include it in the card params - draft: Optional["ModelCard"] = None + draft: ModelCard | None = None class ModelCard(BaseModel): @@ -35,15 +37,15 @@ class ModelCard(BaseModel): object: str = "model" created: int = Field(default_factory=lambda: int(time())) owned_by: str = "tabbyAPI" - logging: Optional[LoggingConfig] = None - parameters: Optional[ModelCardParameters] = None + logging: LoggingConfig | None = None + parameters: ModelCardParameters | None = None class ModelList(BaseModel): """Represents a list of model cards.""" object: str = "list" - data: List[ModelCard] = Field(default_factory=list) + data: list[ModelCard] = Field(default_factory=list) class DraftModelLoadRequest(BaseModel): @@ -53,13 +55,13 @@ class DraftModelLoadRequest(BaseModel): draft_model_name: str # Config arguments - draft_rope_scale: Optional[float] = None - draft_rope_alpha: Optional[Union[float, Literal["auto"]]] = Field( + draft_rope_scale: float | None = None + draft_rope_alpha: float | Literal["auto"] | None = Field( description='Automatically calculated if set to "auto"', default=None, examples=[1.0], ) - draft_gpu_split: Optional[List[float]] = Field( + draft_gpu_split: list[float] | None = Field( default_factory=list, examples=[[24.0, 20.0]], ) @@ -75,54 +77,54 @@ class ModelLoadRequest(BaseModel): model_name: str # Config arguments - backend: Optional[str] = Field( + backend: str | None = Field( description="Backend to use", default=None, ) - max_seq_len: Optional[int] = Field( + max_seq_len: int | None = Field( description="Leave this blank to use the model's base sequence length", default=None, examples=[4096], ) - cache_size: Optional[int] = Field( + cache_size: int | None = Field( description="Number in tokens, must be multiple of 256", default=None, examples=[4096], ) - cache_mode: Optional[str] = None - tensor_parallel: Optional[bool] = None - tensor_parallel_backend: Optional[str] = "native" - gpu_split_auto: Optional[bool] = None - autosplit_reserve: Optional[List[float]] = None - gpu_split: Optional[List[float]] = Field( + cache_mode: str | None = None + tensor_parallel: bool | None = None + tensor_parallel_backend: str | None = "native" + gpu_split_auto: bool | None = None + autosplit_reserve: list[float] | None = None + gpu_split: list[float] | None = Field( default_factory=list, examples=[[24.0, 20.0]], ) - rope_scale: Optional[float] = Field( + rope_scale: float | None = Field( description="Automatically pulled from the model's config if not present", default=None, examples=[1.0], ) - rope_alpha: Optional[Union[float, Literal["auto"]]] = Field( + rope_alpha: float | Literal["auto"] | None = Field( description='Automatically calculated if set to "auto"', default=None, examples=[1.0], ) - chunk_size: Optional[int] = None - output_chunking: Optional[bool] = True - prompt_template: Optional[str] = None - vision: Optional[bool] = None + chunk_size: int | None = None + output_chunking: bool | None = True + 
prompt_template: str | None = None + vision: bool | None = None # Non-config arguments - draft_model: Optional[DraftModelLoadRequest] = None - skip_queue: Optional[bool] = False + draft_model: DraftModelLoadRequest | None = None + skip_queue: bool | None = False class EmbeddingModelLoadRequest(BaseModel): embedding_model_name: str - # Set default from the config - embeddings_device: Optional[str] = Field(config.embeddings.embeddings_device) + # set default from the config + embeddings_device: str | None = Field(config.embeddings.embeddings_device) class ModelLoadResponse(BaseModel): diff --git a/endpoints/core/types/sampler_overrides.py b/endpoints/core/types/sampler_overrides.py index 18627829..2a2efab0 100644 --- a/endpoints/core/types/sampler_overrides.py +++ b/endpoints/core/types/sampler_overrides.py @@ -1,5 +1,4 @@ from pydantic import BaseModel, Field -from typing import List, Optional from common.sampling import SamplerOverridesContainer @@ -7,17 +6,17 @@ class SamplerOverrideListResponse(SamplerOverridesContainer): """Sampler override list response""" - presets: Optional[List[str]] + presets: list[str] | None class SamplerOverrideSwitchRequest(BaseModel): """Sampler override switch request""" - preset: Optional[str] = Field( + preset: str | None = Field( default=None, description="Pass a sampler override preset name" ) - overrides: Optional[dict] = Field( + overrides: dict | None = Field( default=None, description=( "Sampling override parent takes in individual keys and overrides. " diff --git a/endpoints/core/types/template.py b/endpoints/core/types/template.py index a82ef48d..a3b98fb2 100644 --- a/endpoints/core/types/template.py +++ b/endpoints/core/types/template.py @@ -1,12 +1,11 @@ from pydantic import BaseModel, Field -from typing import List class TemplateList(BaseModel): """Represents a list of templates.""" object: str = "list" - data: List[str] = Field(default_factory=list) + data: list[str] = Field(default_factory=list) class TemplateSwitchRequest(BaseModel): diff --git a/endpoints/core/types/token.py b/endpoints/core/types/token.py index d43e65e4..8a28d880 100644 --- a/endpoints/core/types/token.py +++ b/endpoints/core/types/token.py @@ -1,7 +1,6 @@ """Tokenization types""" from pydantic import BaseModel -from typing import List, Union from endpoints.OAI.types.chat_completion import ChatCompletionMessage @@ -25,20 +24,20 @@ def get_params(self): class TokenEncodeRequest(CommonTokenRequest): """Represents a tokenization request.""" - text: Union[str, List[ChatCompletionMessage]] + text: str | list[ChatCompletionMessage] class TokenEncodeResponse(BaseModel): """Represents a tokenization response.""" - tokens: List[int] + tokens: list[int] length: int class TokenDecodeRequest(CommonTokenRequest): """ " Represents a detokenization request.""" - tokens: List[int] + tokens: list[int] class TokenDecodeResponse(BaseModel): diff --git a/endpoints/core/utils/model.py b/endpoints/core/utils/model.py index 20c9433c..fa11186e 100644 --- a/endpoints/core/utils/model.py +++ b/endpoints/core/utils/model.py @@ -1,6 +1,5 @@ import pathlib from asyncio import CancelledError -from typing import Optional from common import model from common.networking import get_generator_error, handle_request_disconnect @@ -13,7 +12,7 @@ ) -def get_model_list(model_path: pathlib.Path, draft_model_path: Optional[str] = None): +def get_model_list(model_path: pathlib.Path, draft_model_path: str | None = None): """Get the list of models from the provided path.""" # Convert the provided draft model path to a 
pathlib path for @@ -83,7 +82,7 @@ async def stream_model_load( # Get trimmed load data load_data = data.model_dump(exclude_none=True) - # Set the draft model directory + # set the draft model directory load_data.setdefault("draft_model", {})["draft_model_dir"] = ( config.draft_model.draft_model_dir ) diff --git a/endpoints/server.py b/endpoints/server.py index c17cbb44..4287f747 100644 --- a/endpoints/server.py +++ b/endpoints/server.py @@ -3,7 +3,6 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from loguru import logger -from typing import Optional from common.logger import UVICORN_LOG_CONFIG from common.networking import get_global_depends @@ -13,7 +12,7 @@ from endpoints.core.router import router as CoreRouter -def setup_app(host: Optional[str] = None, port: Optional[int] = None): +def setup_app(host: str | None = None, port: int | None = None): """Includes the correct routers for startup""" app = FastAPI( diff --git a/main.py b/main.py index 661613eb..1409c656 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ """The main tabbyAPI module. Contains the FastAPI server and endpoints.""" -# Set this env var for cuda malloc async before torch is initalized +# set this env var for cuda malloc async before torch is initalized import os os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync" @@ -11,7 +11,6 @@ import platform import signal from loguru import logger -from typing import Optional from common import gen_logging, sampling, model from common.args import convert_args_to_dict, init_argparser @@ -92,7 +91,7 @@ async def entrypoint_async(): gen_logging.broadcast_status() - # Set sampler parameter overrides if provided + # set sampler parameter overrides if provided sampling_override_preset = config.sampling.override_preset if sampling_override_preset: try: @@ -104,12 +103,12 @@ async def entrypoint_async(): def entrypoint( - args: Optional[argparse.Namespace] = None, - parser: Optional[argparse.ArgumentParser] = None, + args: argparse.Namespace | None = None, + parser: argparse.ArgumentParser | None = None, ): setup_logger() - # Set up signal aborting + # set up signal aborting signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) @@ -118,7 +117,7 @@ def entrypoint( else: from uvloop import install - # Set loop event policy + # set loop event policy install() # Parse and override config from args @@ -158,7 +157,7 @@ def entrypoint( raise SystemExit(install_message) - # Set the process priority + # set the process priority if config.developer.realtime_process_priority: import psutil diff --git a/start.py b/start.py index 95bdd366..20c4d489 100644 --- a/start.py +++ b/start.py @@ -9,7 +9,6 @@ import sys import traceback from shutil import copyfile, which -from typing import List # Checks for uv installation has_uv = which("uv") is not None @@ -154,7 +153,7 @@ def migrate_start_options(start_options: dict): return migrated -def run_pip(command: List[str]): +def run_pip(command: list[str]): if has_uv: command.insert(0, "uv") @@ -204,10 +203,10 @@ def run_pip(command: List[str]): "Getting things ready..." 
) - # Set variables that rely on start options + # set variables that rely on start options first_run = not start_options.get("first_run_done") - # Set gpu_lib for dependency install + # set gpu_lib for dependency install if args.gpu_lib: print("Overriding GPU lib name from args.") gpu_lib = args.gpu_lib From 22cb816b5ea5a08f00899ea24d535ec44410b1b5 Mon Sep 17 00:00:00 2001 From: AlpinDale Date: Fri, 31 Oct 2025 20:58:42 +0430 Subject: [PATCH 2/3] fix --- backends/exllamav2/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 903b4929..74024f63 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -271,6 +271,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs self.config.max_seq_len = unwrap( user_max_seq_len, min(hf_model.hf_config.max_position_embeddings, 4096) ) + self.cache_size = self.config.max_seq_len # set the rope scale self.config.scale_pos_emb = unwrap( From 4b8c210a8823951345e4be0c11bda925932a291a Mon Sep 17 00:00:00 2001 From: AlpinDale <52078762+AlpinDale@users.noreply.github.com> Date: Fri, 31 Oct 2025 21:24:27 +0430 Subject: [PATCH 3/3] Apply suggestions from code review --- backends/exllamav2/model.py | 34 +++++++++++++------------- backends/exllamav3/model.py | 12 ++++----- colab/TabbyAPI_Colab_Example.ipynb | 6 ++--- common/auth.py | 2 +- common/config_models.py | 12 ++++----- common/tabby_config.py | 2 +- config_sample.yml | 14 +++++------ docker/Dockerfile | 4 +-- docs/01.-Getting-Started.md | 2 +- docs/02.-Server-options.md | 10 ++++---- docs/06.-Sharing.md | 2 +- endpoints/OAI/router.py | 4 +-- endpoints/OAI/types/embedding.py | 4 +-- endpoints/OAI/utils/chat_completion.py | 2 +- endpoints/core/router.py | 4 +-- endpoints/core/types/model.py | 2 +- endpoints/core/utils/model.py | 2 +- main.py | 10 ++++---- start.py | 4 +-- 19 files changed, 66 insertions(+), 66 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 74024f63..5c5c934c 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -129,7 +129,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Check if the model arch is compatible with various exl2 features self.config.arch_compat_overrides() - # set vision state and error if vision isn't supported on the current model + # Set vision state and error if vision isn't supported on the current model self.use_vision = unwrap(kwargs.get("vision"), False) if self.use_vision and not self.config.vision_model_type: raise ValueError( @@ -184,12 +184,12 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs gpu_split = unwrap(kwargs.get("gpu_split"), []) gpu_device_list = list(range(0, gpu_count)) - # set GPU split options + # Set GPU split options if gpu_count == 1: self.gpu_split_auto = False logger.info("Disabling GPU split because one GPU is in use.") else: - # set tensor parallel + # Set tensor parallel if use_tp: self.use_tp = True @@ -232,7 +232,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Hardcode max output length to 16 self.config.max_output_len = 16 - # set max batch size to the config override + # Set max batch size to the config override self.max_batch_size = unwrap(kwargs.get("max_batch_size")) # Check whether the user's configuration supports flash/paged attention @@ -261,7 +261,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # 
Grab user-set max seq len user_max_seq_len = kwargs.get("max_seq_len") - # set k/v cache size + # Set k/v cache size # cache_size is only relevant when paged mode is enabled if self.paged: user_cache_size = coalesce(kwargs.get("cache_size"), user_max_seq_len, 4096) @@ -273,7 +273,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs ) self.cache_size = self.config.max_seq_len - # set the rope scale + # Set the rope scale self.config.scale_pos_emb = unwrap( kwargs.get("rope_scale"), self.config.scale_pos_emb ) @@ -322,7 +322,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs self.config.max_input_len = chunk_size self.config.max_attention_size = chunk_size**2 - # set user-configured draft model values + # Set user-configured draft model values if self.use_draft_model: self.draft_config.max_seq_len = self.config.max_seq_len @@ -330,7 +330,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs draft_args.get("draft_rope_scale"), 1.0 ) - # set draft rope alpha. Follows same behavior as model rope alpha. + # Set draft rope alpha. Follows same behavior as model rope alpha. # Use the max_position_embeddings of the model draft_rope_alpha = unwrap(draft_args.get("draft_rope_alpha"), "auto") if draft_rope_alpha == "auto": @@ -341,7 +341,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs else: self.draft_config.scale_alpha_value = draft_rope_alpha - # set draft cache mode + # Set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") # Catch exllamav3 draft_cache_mode @@ -836,7 +836,7 @@ async def unload(self, loras_only: bool = False, **kwargs): await self.generator.close() self.generator = None - # set all model state variables to False + # Set all model state variables to False self.loaded = False gc.collect() @@ -1136,15 +1136,15 @@ def assign_gen_params( "top_k, top_p, and typical to 1.0, 1, 0, and 0." ) - # set banned tokens + # Set banned tokens if params.banned_tokens: gen_settings.disallow_tokens(self.tokenizer, params.banned_tokens) - # set allowed tokens + # Set allowed tokens if params.allowed_tokens: gen_settings.allow_tokens(self.tokenizer, params.allowed_tokens) - # set logit bias + # Set logit bias if params.logit_bias: # Create a vocab tensor if it doesn't exist for token biasing if gen_settings.token_bias is None: @@ -1261,7 +1261,7 @@ async def generate_gen( grammar_handler, ) - # set banned strings + # Set banned strings banned_strings = params.banned_strings if banned_strings and len(grammar_handler.filters) > 0: logger.warning( @@ -1271,7 +1271,7 @@ async def generate_gen( banned_strings = [] - # set CFG scale and negative prompt + # Set CFG scale and negative prompt cfg_scale = params.cfg_scale negative_prompt = None if cfg_scale not in [None, 1.0]: @@ -1301,7 +1301,7 @@ async def generate_gen( stop_conditions = params.stop ban_eos_token = params.ban_eos_token - # set add_bos_token for generation + # Set add_bos_token for generation add_bos_token = unwrap(params.add_bos_token, self.hf_model.add_bos_token()) # Fetch EOS tokens from the HF model if they exist @@ -1309,7 +1309,7 @@ async def generate_gen( # Ban the EOS token if specified. If not, append to stop conditions # as well. 
- # set this below logging to avoid polluting the stop strings array + # Set this below logging to avoid polluting the stop strings array if ban_eos_token: gen_settings.disallow_tokens(self.tokenizer, eos_tokens) else: diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 4b369c7f..389f581b 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -157,12 +157,12 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs gpu_device_list = list(range(0, gpu_count)) use_tp = unwrap(kwargs.get("tensor_parallel"), False) - # set GPU split options + # Set GPU split options if gpu_count == 1: self.gpu_split_auto = False logger.info("Disabling GPU split because one GPU is in use.") else: - # set tensor parallel + # Set tensor parallel if use_tp: self.use_tp = True tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native") @@ -179,7 +179,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # TP has its own autosplit loader self.gpu_split_auto = False - # set GPU split options + # Set GPU split options # Enable manual GPU split if provided if gpu_split: self.gpu_split_auto = False @@ -235,7 +235,7 @@ async def create(cls, model_directory: pathlib.Path, hf_model: HFModel, **kwargs # Draft cache if self.use_draft_model: - # set draft cache mode + # Set draft cache mode self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16") self.draft_cache = self.create_cache( self.draft_cache_mode, self.draft_model @@ -870,7 +870,7 @@ async def generate_gen( # Penalties - # set penalty range + # Set penalty range penalty_range = unwrap(params.penalty_range, self.max_seq_len) # Exl3's version of including the entire context @@ -911,7 +911,7 @@ async def generate_gen( sampler_builder.temperature(params.temperature) # Build the sampler - # set greedy if temperature is 0 + # Set greedy if temperature is 0 sampler = sampler_builder.build(params.temperature == 0) # Dynamically scale penalty range to output tokens diff --git a/colab/TabbyAPI_Colab_Example.ipynb b/colab/TabbyAPI_Colab_Example.ipynb index c23bd345..b8e32f0a 100644 --- a/colab/TabbyAPI_Colab_Example.ipynb +++ b/colab/TabbyAPI_Colab_Example.ipynb @@ -184,13 +184,13 @@ " # Only use if your model was trained on long context with rope (check config.json)\n", " rope_alpha: {RopeAlpha}\n", "\n", - " # Disable Flash-attention 2. set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n", + " # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n", " no_flash_attention: {NoFlashAttention}\n", "\n", " # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)\n", " cache_mode: {CacheMode}\n", "\n", - " # set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n", + " # Set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n", " # NOTE: Only works with chat completion message lists!\n", " prompt_template: {PromptTemplate}\n", "\n", @@ -218,7 +218,7 @@ " # Overrides the directory to look for loras (default: loras)\n", " lora_dir: loras\n", "\n", - " # list of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.\n", + " # List of loras to load and associated scaling factors (default: 1.0). 
Comment out unused entries or add more rows as needed.\n", " loras:\n", " - name: {lora}\n", " scaling: {LoraScaling}\n", diff --git a/common/auth.py b/common/auth.py index c986fc40..bd93afe0 100644 --- a/common/auth.py +++ b/common/auth.py @@ -51,7 +51,7 @@ async def load_auth_keys(disable_from_config: bool): if disable_from_config: logger.warning( "Disabling authentication makes your instance vulnerable. " - "set the `disable_auth` flag to False in config.yml if you " + "Set the `disable_auth` flag to False in config.yml if you " "want to share this instance with others." ) diff --git a/common/config_models.py b/common/config_models.py index 781acdb3..c5772c42 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -250,7 +250,7 @@ class ModelConfig(BaseConfigModel): None, description=( "Rope alpha (default: None).\n" - 'Same as alpha_value. set to "auto" to auto-calculate.\n' + 'Same as alpha_value. Set to "auto" to auto-calculate.\n' "Leaving this value blank will either pull from the model " "or auto-calculate." ), @@ -277,7 +277,7 @@ class ModelConfig(BaseConfigModel): max_batch_size: int | None = Field( None, description=( - "set the maximum number of prompts to process at one time " + "Set the maximum number of prompts to process at one time " "(default: None/Automatic).\n" "Automatically calculated if left blank.\n" "NOTE: Only available for Nvidia ampere (30 series) and above GPUs." @@ -287,7 +287,7 @@ class ModelConfig(BaseConfigModel): prompt_template: str | None = Field( None, description=( - "set the prompt template for this model. (default: None)\n" + "Set the prompt template for this model. (default: None)\n" "If empty, attempts to look for the model's chat template.\n" "If a model contains multiple templates in its tokenizer_config.json,\n" "set prompt_template to the name of the template you want to use.\n" @@ -335,7 +335,7 @@ class DraftModelConfig(BaseConfigModel): None, description=( "Rope alpha for draft models (default: None).\n" - 'Same as alpha_value. set to "auto" to auto-calculate.\n' + 'Same as alpha_value. Set to "auto" to auto-calculate.\n' "Leaving this value blank will either pull from the model " "or auto-calculate." ), @@ -390,7 +390,7 @@ class LoraConfig(BaseConfigModel): loras: list[LoraInstanceModel] | None = Field( None, description=( - "list of LoRAs to load and associated scaling factors " + "List of LoRAs to load and associated scaling factors " "(default scale: 1.0).\n" "For the YAML file, add each entry as a YAML list:\n" "- name: lora1\n" @@ -443,7 +443,7 @@ class DeveloperConfig(BaseConfigModel): realtime_process_priority: bool | None = Field( False, description=( - "set process to use a higher priority.\n" + "Set process to use a higher priority.\n" "For realtime process priority, run as administrator or sudo.\n" "Otherwise, the priority will be set to high." 
), diff --git a/common/tabby_config.py b/common/tabby_config.py index 212746f1..535cb6b2 100644 --- a/common/tabby_config.py +++ b/common/tabby_config.py @@ -44,7 +44,7 @@ def load(self, arguments: dict | None = None): value = getattr(merged_config_model, field) setattr(self, field, value) - # set model defaults dict once to prevent on-demand reconstruction + # Set model defaults dict once to prevent on-demand reconstruction # TODO: clean this up a bit for field in self.model.use_as_default: if hasattr(self.model, field): diff --git a/config_sample.yml b/config_sample.yml index 88fa308a..0b65f9e8 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -79,7 +79,7 @@ model: backend: # Max sequence length (default: min(max_position_embeddings, cache_size)). - # set to -1 to fetch from the model's config.json + # Set to -1 to fetch from the model's config.json max_seq_len: # Size of the key/value cache to allocate, in tokens (default: 4096). @@ -126,7 +126,7 @@ model: rope_scale: 1.0 # Rope alpha (default: None). - # Same as alpha_value. set to "auto" to auto-calculate. + # Same as alpha_value. Set to "auto" to auto-calculate. # Leaving this value blank will either pull from the model or auto-calculate. rope_alpha: @@ -141,12 +141,12 @@ model: # Used by EXL3 models only. output_chunking: true - # set the maximum number of prompts to process at one time (default: None/Automatic). + # Set the maximum number of prompts to process at one time (default: None/Automatic). # Automatically calculated if left blank. # NOTE: Only available for Nvidia ampere (30 series) and above GPUs. max_batch_size: - # set the prompt template for this model. (default: None) + # Set the prompt template for this model. (default: None) # If empty, attempts to look for the model's chat template. # If a model contains multiple templates in its tokenizer_config.json, # set prompt_template to the name of the template you want to use. @@ -172,7 +172,7 @@ draft_model: draft_rope_scale: 1.0 # Rope alpha for draft models (default: None). - # Same as alpha_value. set to "auto" to auto-calculate. + # Same as alpha_value. Set to "auto" to auto-calculate. # Leaving this value blank will either pull from the model or auto-calculate. draft_rope_alpha: @@ -199,7 +199,7 @@ lora: # Directory to look for LoRAs (default: loras). lora_dir: loras - # list of LoRAs to load and associated scaling factors (default scale: 1.0). + # List of LoRAs to load and associated scaling factors (default scale: 1.0). # For the YAML file, add each entry as a YAML list: # - name: lora1 # scaling: 1.0 @@ -230,7 +230,7 @@ developer: # Disable API request streaming (default: False). disable_request_streaming: false - # set process to use a higher priority. + # Set process to use a higher priority. # For realtime process priority, run as administrator or sudo. # Otherwise, the priority will be set to high. realtime_process_priority: false diff --git a/docker/Dockerfile b/docker/Dockerfile index ba038cec..58aa61f5 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,7 +20,7 @@ ENV PATH="/opt/venv/bin:$PATH" # Upgrade pip RUN pip install --no-cache-dir --upgrade pip -# set the working directory in the container +# Set the working directory in the container WORKDIR /app # Get requirements @@ -37,7 +37,7 @@ COPY . . 
# Make port 5000 available to the world outside this container EXPOSE 5000 -# set the entry point +# Set the entry point ENTRYPOINT ["python3"] # Run main.py when the container launches diff --git a/docs/01.-Getting-Started.md b/docs/01.-Getting-Started.md index 7e74abd9..330116c6 100644 --- a/docs/01.-Getting-Started.md +++ b/docs/01.-Getting-Started.md @@ -171,7 +171,7 @@ These are short-form instructions for other methods that users can use to instal 2. For Windows: [Cuda Toolkit on WSL](https://docs.nvidia.com/cuda/wsl-user-guide/index.html) 3. Clone TabbyAPI via `git clone https://github.com/theroyallab/tabbyAPI` 4. Enter the tabbyAPI directory by `cd tabbyAPI`. - 1. Optional: set up a config.yml or api_tokens.yml ([configuration](#configuration)) + 1. Optional: Set up a config.yml or api_tokens.yml ([configuration](#configuration)) 5. Update the volume mount section in the `docker/docker-compose.yml` file ```yml volumes: diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md index 03f866ad..00546e1b 100644 --- a/docs/02.-Server-options.md +++ b/docs/02.-Server-options.md @@ -16,8 +16,8 @@ All of these options have descriptive comments above them. You should not need t | Config Option | Type (Default) | Description | | ---------------------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| host | String (127.0.0.1) | set the IP address used for hosting TabbyAPI | -| port | Int (5000) | set the TCP Port use for TabbyAPI | +| host | String (127.0.0.1) | Set the IP address used for hosting TabbyAPI | +| port | Int (5000) | Set the TCP Port use for TabbyAPI | | disable_auth | Bool (False) | Disables API authentication | | disable_fetch_requests | Bool (False) | Disables fetching external content when responding to requests (ex. fetching images from URLs) | | send_tracebacks | Bool (False) | Send server tracebacks to client.
Note: It's not recommended to enable this if sharing the instance with others. | @@ -47,7 +47,7 @@ Note: These are experimental flags that may be removed at any point. | ------------------------- | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | unsafe_launch | Bool (False) | Skips dependency checks on startup. Only recommended for debugging. | | disable_request_streaming | Bool (False) | Forcefully disables streaming requests | -| realtime_process_priority | Bool (False) | set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. | +| realtime_process_priority | Bool (False) | Set the process priority to "Realtime". Administrator/sudo access required, otherwise the priority is set to the highest it can go in userland. | ### Model Options @@ -58,7 +58,7 @@ Note: Most of the options here will only apply on initial model load/startup (ep | model_dir | String ("models") | Directory to look for models.
Note: Persisted across subsequent load requests | | inline_model_loading | Bool (False) | Enables ability to switch models using the `model` argument in a generation request. More info in [Usage](https://github.com/theroyallab/tabbyAPI/wiki/03.-Usage#inline-loading) | | use_dummy_models | Bool (False) | Send a dummy OAI model card when calling the `/v1/models` endpoint. Used for clients which enforce specific OAI models.
Note: Persisted across subsequent load requests | -| dummy_model_names | list[String] (["gpt-3.5-turbo"]) | list of dummy names to send on model endpoint requests | +| dummy_model_names | list[String] (["gpt-3.5-turbo"]) | List of dummy names to send on model endpoint requests | | model_name | String (None) | Folder name of a model to load. The below parameters will not apply unless this is filled out. | | use_as_default | list[String] ([]) | Keys to use by default when loading models. For example, putting `cache_mode` in this array will make every model load with that value unless specified by the API request.
Note: Also applies to the `draft` sub-block | | max_seq_len | Float (None) | Maximum sequence length of the model. Uses the value from config.json if not specified here. Also called the max context length. | @@ -95,7 +95,7 @@ Note: Sub-block of Mode Options. Same rules apply. | Config Option | Type (Default) | Description | |---------------|------------------|--------------------------------------------------------------| | lora_dir | String ("loras") | Directory to look for loras.
Note: Persisted across subsequent load requests | -| loras | list[loras] ([]) | list of lora objects to apply to the model. Each object contains a name and scaling. | +| loras | list[loras] ([]) | List of lora objects to apply to the model. Each object contains a name and scaling. | | name | String (None) | Folder name of a lora to load.
Note: An element of the `loras` key | | scaling | Float (1.0) | "Weight" to apply the lora on the parent model. For example, applying a lora with 0.9 scaling will lower the amount of application on the parent model.
Note: An element of the `loras` key | diff --git a/docs/06.-Sharing.md b/docs/06.-Sharing.md index a9f69559..86288d80 100644 --- a/docs/06.-Sharing.md +++ b/docs/06.-Sharing.md @@ -28,7 +28,7 @@ Tailscale is a product that uses the WireGuard protocol to provide a mesh networ > This is not a method for exposing your TabbyAPI instance to the world. If you want that, use the other two services. To get started: -1. set your TabbyAPI ip to `0.0.0.0` otherwise you will not be able to access your instance outside your local machine. +1. Set your TabbyAPI ip to `0.0.0.0` otherwise you will not be able to access your instance outside your local machine. 2. Sign up and get started on [Tailscale's website](https://tailscale.com/), then install the client. 3. Connect to your tailscale account on both your host and client machine. 4. Select the Tailscale icon (usually in the system tray) and get the IP of your host device. This is usually identified by the hostname. diff --git a/endpoints/OAI/router.py b/endpoints/OAI/router.py index bb64a48a..8f4e7a4e 100644 --- a/endpoints/OAI/router.py +++ b/endpoints/OAI/router.py @@ -72,7 +72,7 @@ async def completion_request( disable_request_streaming = config.developer.disable_request_streaming - # set an empty JSON schema if the request wants a JSON response + # Set an empty JSON schema if the request wants a JSON response if data.response_format.type == "json": data.json_schema = {"type": "object"} @@ -125,7 +125,7 @@ async def chat_completion_request( prompt, embeddings = await apply_chat_template(data) - # set an empty JSON schema if the request wants a JSON response + # Set an empty JSON schema if the request wants a JSON response if data.response_format.type == "json": data.json_schema = {"type": "object"} diff --git a/endpoints/OAI/types/embedding.py b/endpoints/OAI/types/embedding.py index 7eaefcf9..abb68fd7 100644 --- a/endpoints/OAI/types/embedding.py +++ b/endpoints/OAI/types/embedding.py @@ -9,7 +9,7 @@ class UsageInfo(BaseModel): class EmbeddingsRequest(BaseModel): input: str | list[str] = Field( - ..., description="list of input texts to generate embeddings for." + ..., description="List of input texts to generate embeddings for." ) encoding_format: str = Field( "float", @@ -25,7 +25,7 @@ class EmbeddingsRequest(BaseModel): class EmbeddingObject(BaseModel): object: str = Field("embedding", description="Type of the object.") embedding: list[float] | str = Field( - ..., description="Embedding values as a List of floats." + ..., description="Embedding values as a list of floats." ) index: int = Field( ..., description="Index of the input text corresponding to the embedding." 
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index 819a409c..157c96ba 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -69,7 +69,7 @@ def _create_response( logprob_response = ChatCompletionLogprobs(content=collected_token_probs) - # set finish reason + # Set finish reason if message.tool_calls: finish_reason = "tool_calls" else: diff --git a/endpoints/core/router.py b/endpoints/core/router.py index 3b362a73..d69d683f 100644 --- a/endpoints/core/router.py +++ b/endpoints/core/router.py @@ -274,7 +274,7 @@ async def load_lora(data: LoraLoadRequest) -> LoraLoadResponse: if not data.loras: error_message = handle_request_error( - "list of loras to load is not found.", + "List of loras to load is not found.", exc_info=False, ).error.message @@ -546,7 +546,7 @@ async def unload_template(): @router.get("/v1/sampling/override/list", dependencies=[Depends(check_api_key)]) async def list_sampler_overrides(request: Request) -> SamplerOverrideListResponse: """ - list all currently applied sampler overrides. + List all currently applied sampler overrides. Requires an admin key to see all override presets. """ diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index 0f4252fb..ad0d8a85 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -123,7 +123,7 @@ class ModelLoadRequest(BaseModel): class EmbeddingModelLoadRequest(BaseModel): embedding_model_name: str - # set default from the config + # Set default from the config embeddings_device: str | None = Field(config.embeddings.embeddings_device) diff --git a/endpoints/core/utils/model.py b/endpoints/core/utils/model.py index fa11186e..f2b39850 100644 --- a/endpoints/core/utils/model.py +++ b/endpoints/core/utils/model.py @@ -82,7 +82,7 @@ async def stream_model_load( # Get trimmed load data load_data = data.model_dump(exclude_none=True) - # set the draft model directory + # Set the draft model directory load_data.setdefault("draft_model", {})["draft_model_dir"] = ( config.draft_model.draft_model_dir ) diff --git a/main.py b/main.py index 1409c656..3f497901 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ """The main tabbyAPI module. Contains the FastAPI server and endpoints.""" -# set this env var for cuda malloc async before torch is initalized +# Set this env var for cuda malloc async before torch is initalized import os os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync" @@ -91,7 +91,7 @@ async def entrypoint_async(): gen_logging.broadcast_status() - # set sampler parameter overrides if provided + # Set sampler parameter overrides if provided sampling_override_preset = config.sampling.override_preset if sampling_override_preset: try: @@ -108,7 +108,7 @@ def entrypoint( ): setup_logger() - # set up signal aborting + # Set up signal aborting signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) @@ -117,7 +117,7 @@ def entrypoint( else: from uvloop import install - # set loop event policy + # Set loop event policy install() # Parse and override config from args @@ -157,7 +157,7 @@ def entrypoint( raise SystemExit(install_message) - # set the process priority + # Set the process priority if config.developer.realtime_process_priority: import psutil diff --git a/start.py b/start.py index 20c4d489..4811c9ac 100644 --- a/start.py +++ b/start.py @@ -203,10 +203,10 @@ def run_pip(command: list[str]): "Getting things ready..." 
) - # set variables that rely on start options + # Set variables that rely on start options first_run = not start_options.get("first_run_done") - # set gpu_lib for dependency install + # Set gpu_lib for dependency install if args.gpu_lib: print("Overriding GPU lib name from args.") gpu_lib = args.gpu_lib
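
For reference, a minimal sketch of the annotation style this series converges on: PEP 585 builtin generics plus PEP 604 unions (Python 3.10+), the same style already visible in context lines above such as `max_batch_size: int | None` and `def run_pip(command: list[str])`. The helper below and its data are hypothetical, written purely for illustration; it is not taken from the diff.

    # Before: the typing aliases this series removes
    #     from typing import Dict, List, Optional
    #     def merge(overrides: Optional[Dict[str, List[int]]]) -> List[int]: ...

    # After: builtin generics and "|" unions, no typing imports required
    def merge(overrides: dict[str, list[int]] | None = None) -> list[int]:
        """Flatten override values into a single list (hypothetical helper)."""
        merged: list[int] = []
        for values in (overrides or {}).values():
            merged.extend(values)
        return merged

    print(merge({"a": [1, 2], "b": [3]}))  # prints [1, 2, 3]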