-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinference.py
More file actions
161 lines (124 loc) · 3.88 KB
/
inference.py
File metadata and controls
161 lines (124 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Phi-3-mini + LoRA Adapter Inference Script
For working with the trained model.
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
# ============== CONFIGURATION ==============
# Model identifier passed to the tokenizer and base-model `from_pretrained` calls.
BASE_MODEL = "microsoft/Phi-3-mini-4k-instruct"
# Default adapter location handed to `PeftModel.from_pretrained`.
ADAPTER_PATH = "./procurement-lora"
# Default cap on the number of newly generated tokens per response.
MAX_NEW_TOKENS = 512
# ============================================
def load_model(adapter_path: str = ADAPTER_PATH):
    """Load the 4-bit quantized base model and wrap it with the LoRA adapter.

    Args:
        adapter_path: Adapter location handed to ``PeftModel.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been switched to
        evaluation mode.
    """
    print("Model yuklanmoqda...")

    # 4-bit NF4 quantization settings with double quantization.
    quant_settings = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    quant_cfg = BitsAndBytesConfig(**quant_settings)

    # Tokenizer: reuse the end-of-sequence token as the padding token.
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    tok.pad_token = tok.eos_token

    # Quantized base model.
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="eager",
    )

    # Attach the LoRA adapter on top of the base model.
    adapted = PeftModel.from_pretrained(base, adapter_path)
    adapted.eval()

    print("Model tayyor!")
    return adapted, tok
def generate(
    model,
    tokenizer,
    instruction: str,
    input_text: str = "",
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> str:
    """Generate the assistant's reply to a single instruction.

    The prompt wraps the user turn in ``<|user|>`` / ``<|end|>`` markers and
    ends with an open ``<|assistant|>`` turn. When ``input_text`` is
    non-empty it is appended to the instruction under a ``Ma'lumot:``
    heading.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``decode`` and
            ``eos_token_id``.
        instruction: The user's question or task.
        input_text: Optional extra context for the instruction.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation only (prompt excluded), cut at the first
        end-of-turn / end-of-sequence marker and stripped of surrounding
        whitespace.
    """
    # Build the prompt; the literal layout matches the training format.
    user_turn = instruction
    if input_text:
        user_turn = f"{instruction}\nMa'lumot:\n{input_text}"
    prompt = f"<|user|>\n{user_turn}<|end|>\n<|assistant|>\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=False,  # Caching disabled to avoid a cache problem
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on "<|assistant|>" picks the wrong span whenever the user
    # text or the model output itself contains that marker.
    new_tokens = outputs[0][prompt_len:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=False)

    # Cut at the first end-of-turn marker, and at the EOS token if the
    # tokenizer exposes one, so no special token leaks into the reply.
    stop_markers = ["<|end|>"]
    eos_token = getattr(tokenizer, "eos_token", None)
    if isinstance(eos_token, str) and eos_token:
        stop_markers.append(eos_token)
    for marker in stop_markers:
        response = response.partition(marker)[0]

    return response.strip()
def _read_line(prompt_text):
    """Read one stripped line from stdin; return None on EOF or Ctrl-C."""
    try:
        return input(prompt_text).strip()
    except (EOFError, KeyboardInterrupt):
        return None


def interactive_chat(model, tokenizer):
    """Run a question/answer loop on the terminal until the user quits.

    Each round asks for a question and an optional context line, then prints
    the generated answer. The loop ends when the user types one of the exit
    words, or when stdin is closed / interrupted (Ctrl-D, Ctrl-C) at a
    prompt, instead of crashing with a traceback.

    Args:
        model: Model forwarded to ``generate``.
        tokenizer: Tokenizer forwarded to ``generate``.
    """
    exit_words = {"exit", "q", "quit", "chiqish"}

    print("\n" + "=" * 50)
    print("PROCUREMENT ASSISTANT")
    print("Chiqish uchun 'exit' yoki 'q' yozing")
    print("=" * 50)

    while True:
        print("\n")
        instruction = _read_line("Savol: ")
        if instruction is None:
            # End of input or interrupt: leave the loop like an explicit exit.
            print("\nXayr!")
            break
        if instruction.lower() in exit_words:
            print("Xayr!")
            break
        if not instruction:
            continue

        # Optional extra context for the question.
        input_text = _read_line(
            "Ma'lumot (bo'sh qoldiring agar kerak bo'lmasa): "
        )
        if input_text is None:
            print("\nXayr!")
            break

        print("\nJavob generatsiya qilinmoqda...")
        response = generate(model, tokenizer, instruction, input_text)

        print("\n" + "-" * 40)
        print("JAVOB:")
        print("-" * 40)
        print(response)
def batch_inference(model, tokenizer, questions: list) -> list:
    """Answer a list of questions one after another.

    Args:
        model: Model forwarded to ``generate``.
        tokenizer: Tokenizer forwarded to ``generate``.
        questions: Items read via ``.get("instruction", "")`` and
            ``.get("input", "")``.

    Returns:
        One dict per question with the keys ``instruction``, ``input`` and
        ``response``, in input order.
    """
    answered = []
    for position, item in enumerate(questions, start=1):
        print(f"Processing {position}/{len(questions)}...")

        # Missing fields fall back to empty strings.
        question_text = item.get("instruction", "")
        context_text = item.get("input", "")

        answer = generate(model, tokenizer, question_text, context_text)
        record = dict(
            instruction=question_text,
            input=context_text,
            response=answer,
        )
        answered.append(record)
    return answered
def main():
    """Load the model and tokenizer, then start the interactive chat."""
    loaded = load_model()
    interactive_chat(*loaded)


if __name__ == "__main__":
    main()