File tree Expand file tree Collapse file tree
02_ml_inference/01_text_generation Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1- ## LLM chat inference on a serverless GPU
1+ ## LLM chat inference on a serverless GPU
22# This example runs a small chat LLM (Llama 3.2 1B Instruct) on Runpod serverless GPUs
33# using `transformers.pipeline`.
44#
@@ -60,8 +60,8 @@ async def gpu_hello(
6060 input_data : dict ,
6161) -> dict :
6262 """Generate one chat response using Llama 3.2 1B Instruct on a serverless GPU."""
63- import platform
6463 import os
64+ import platform
6565 from datetime import datetime
6666
6767 import torch
@@ -137,7 +137,9 @@ class MessageRequest(BaseModel):
137137 """Request model for GPU worker."""
138138
139139 message : str = "What is gpu?"
140- system_prompt : str = "You are a helpful assistant chatbot who always responds in a friendly and helpful manner!"
140+ system_prompt : str = (
141+ "You are a helpful assistant chatbot who always responds in a friendly and helpful manner!"
142+ )
141143 max_new_tokens : int = 512
142144
143145
You can’t perform that action at this time.
0 commit comments