phi3-procurement-lora/inference.py at master · rustammdev/phi3-procurement-lora · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Phi-3-mini + LoRA adapter inference script.

Runs inference with the trained model.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# ============== CONFIGURATION ==============
# Identifier of the base model handed to `from_pretrained` for both the
# tokenizer and the causal LM.
BASE_MODEL = "microsoft/Phi-3-mini-4k-instruct"
# Default location of the LoRA adapter; used as the default of `load_model`.
ADAPTER_PATH = "./procurement-lora"
# Default cap on newly generated tokens; used as the default of `generate`.
MAX_NEW_TOKENS = 512
# ============================================


def load_model(adapter_path: str = ADAPTER_PATH):
    """Load the quantized base model, attach the LoRA adapter, return both.

    Args:
        adapter_path: Location of the LoRA adapter passed to
            ``PeftModel.from_pretrained``. Defaults to ``ADAPTER_PATH``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is the adapter-wrapped
        base model switched to eval mode; the tokenizer is loaded from
        ``BASE_MODEL`` with its pad token set to its EOS token.
    """
    print("Model yuklanmoqda...")

    # 4-bit NF4 quantization with double quantization, computing in bfloat16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # The tokenizer reuses EOS as the padding token.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    # Base model options, collected in one place before loading.
    base_options = {
        "quantization_config": quantization,
        "device_map": "auto",
        "trust_remote_code": True,
        "attn_implementation": "eager",
    }
    base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **base_options)

    # Wrap the base model with the LoRA adapter and switch to eval mode.
    model = PeftModel.from_pretrained(base_model, adapter_path)
    model.eval()

    print("Model tayyor!")
    return model, tokenizer


def generate(
    model,
    tokenizer,
    instruction: str,
    input_text: str = "",
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> str:
    """Generate the assistant's answer for one instruction.

    The prompt is built in the ``<|user|> ... <|end|>\\n<|assistant|>``
    layout. When ``input_text`` is non-empty it is appended to the
    instruction under a ``Ma'lumot:`` heading.

    Only the tokens produced after the prompt are decoded, so chat markers
    that appear inside the user's own text cannot be mistaken for the start
    of the answer. The decoded text is then cut at the first end marker
    (``<|end|>`` or ``<|endoftext|>``), so no marker leaks into the result.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Callable tokenizer that also exposes ``.decode`` and
            ``.eos_token_id``.
        instruction: The user's question or task.
        input_text: Optional extra context for the instruction.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The answer text with surrounding whitespace stripped.
    """
    # Build the user turn; the context section is added only when provided.
    user_turn = instruction
    if input_text:
        user_turn = f"{instruction}\n\nMa'lumot:\n{input_text}"
    prompt = f"<|user|>\n{user_turn}<|end|>\n<|assistant|>\n"

    # Tokenize and remember the prompt length so it can be sliced off later.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=False,  # Disabled to avoid a cache-related problem
        )

    # The returned sequence starts with the prompt tokens; keep only the
    # continuation so the answer is not recovered by searching the full text.
    new_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=False)

    # Cut at whichever end marker comes first; anything after it is not
    # part of this answer.
    for marker in ("<|end|>", "<|endoftext|>"):
        response = response.partition(marker)[0]

    return response.strip()


def interactive_chat(model, tokenizer):
    """Run a console question/answer loop until the user asks to exit.

    Each round reads a question and an optional context line, calls
    ``generate`` and prints the answer. The loop ends when the question is
    one of ``exit``, ``q``, ``quit`` or ``chiqish`` (case-insensitive), or
    when the input stream is closed or interrupted (EOF / Ctrl-C) at a
    prompt. Empty questions are skipped.

    Args:
        model: Model object forwarded to ``generate``.
        tokenizer: Tokenizer object forwarded to ``generate``.
    """
    exit_words = {"exit", "q", "quit", "chiqish"}

    print("\n" + "=" * 50)
    print("PROCUREMENT ASSISTANT")
    print("Chiqish uchun 'exit' yoki 'q' yozing")
    print("=" * 50)

    while True:
        print("\n")
        try:
            instruction = input("Savol: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt is treated as a request
            # to leave, instead of crashing with a traceback.
            print("\nXayr!")
            break

        if instruction.lower() in exit_words:
            print("Xayr!")
            break

        if not instruction:
            continue

        # Optional context for the question; may be left empty.
        try:
            input_text = input(
                "Ma'lumot (bo'sh qoldiring agar kerak bo'lmasa): "
            ).strip()
        except (EOFError, KeyboardInterrupt):
            print("\nXayr!")
            break

        print("\nJavob generatsiya qilinmoqda...")
        response = generate(model, tokenizer, instruction, input_text)

        print("\n" + "-" * 40)
        print("JAVOB:")
        print("-" * 40)
        print(response)


def batch_inference(model, tokenizer, questions: list) -> list:
    """Answer a list of questions one after another.

    Args:
        model: Model object forwarded to ``generate``.
        tokenizer: Tokenizer object forwarded to ``generate``.
        questions: Items supporting ``.get``; the ``"instruction"`` and
            ``"input"`` keys are read, each defaulting to ``""``.

    Returns:
        One dict per question, in input order, with the keys
        ``"instruction"``, ``"input"`` and ``"response"``.
    """
    total = len(questions)
    answers = []

    for position, item in enumerate(questions, start=1):
        print(f"Processing {position}/{total}...")

        instruction = item.get("instruction", "")
        input_text = item.get("input", "")
        answer = generate(model, tokenizer, instruction, input_text)

        record = {
            "instruction": instruction,
            "input": input_text,
            "response": answer,
        }
        answers.append(record)

    return answers


def main():
    """Load the model with default settings and start the interactive chat."""
    loaded = load_model()
    # `load_model` returns (model, tokenizer), matching the chat's parameters.
    interactive_chat(*loaded)


# Start the interactive assistant only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()