Commit 9756b1f

Update the quantization for llama.cpp.
1 parent db80817 commit 9756b1f

File tree: 2 files changed, +155 −51 lines

quantllm/api/high_level.py

Lines changed: 49 additions & 16 deletions
@@ -297,13 +297,25 @@ def quantize_from_pretrained(
 def save_quantized_model(
     model: PreTrainedModel,
     output_path: str,
+    save_format: str = "gguf",
     save_tokenizer: bool = True,
-    quant_config: Optional[Dict[str, Any]] = None
+    quant_config: Optional[Dict[str, Any]] = None,
+    safe_serialization: bool = True
 ):
-    """Save a quantized model in GGUF format."""
+    """
+    Save a quantized model in either GGUF or safetensors format.
+
+    Args:
+        model: The quantized model to save
+        output_path: Path to save the model
+        save_format: Format to save in ("gguf" or "safetensors")
+        save_tokenizer: Whether to save the tokenizer
+        quant_config: Optional quantization configuration
+        safe_serialization: Whether to use safe serialization for safetensors format
+    """
     try:
         logger.log_info("\n" + "="*80)
-        logger.log_info("Starting GGUF Export Process".center(80))
+        logger.log_info(f"Starting Model Export Process ({save_format.upper()})".center(80))
         logger.log_info("="*80 + "\n")

         # Log model details
@@ -315,6 +327,7 @@ def save_quantized_model(
         logger.log_info(f"• Architecture: {model.config.model_type}")
         logger.log_info(f"• Total Parameters: {total_params:,}")
         logger.log_info(f"• Model Size: {model_size_gb:.2f} GB")
+        logger.log_info(f"• Export Format: {save_format.upper()}")
         logger.log_info("")

         # Get quantization info
@@ -350,23 +363,43 @@ def save_quantized_model(
         output_dir = os.path.dirname(output_path) or "."
         os.makedirs(output_dir, exist_ok=True)

-        # Convert to GGUF using the new converter
-        from ..quant.llama_cpp_utils import LlamaCppConverter
-
-        converter = LlamaCppConverter()
-        gguf_path = converter.convert_to_gguf(
-            model=model,
-            output_dir=output_dir,
-            bits=quant_config['bits'],
-            group_size=quant_config.get('group_size', 128),
-            save_tokenizer=save_tokenizer
-        )
+        if save_format.lower() == "gguf":
+            # Convert to GGUF using the converter
+            from ..quant.llama_cpp_utils import LlamaCppConverter
+
+            converter = LlamaCppConverter()
+            gguf_path = converter.convert_to_gguf(
+                model=model,
+                output_dir=output_dir,
+                bits=quant_config['bits'],
+                group_size=quant_config.get('group_size', 128),
+                save_tokenizer=save_tokenizer
+            )
+
+            logger.log_info("\n✨ GGUF export completed successfully!")
+
+        else:  # safetensors format
+            logger.log_info("\n💾 Saving model in safetensors format:")
+            logger.log_info("-"*40)
+
+            # Save the model
+            model.save_pretrained(
+                output_dir,
+                safe_serialization=safe_serialization
+            )
+            logger.log_info("• Model weights saved successfully")
+
+            # Save tokenizer if requested
+            if save_tokenizer and hasattr(model, 'tokenizer'):
+                logger.log_info("• Saving tokenizer...")
+                model.tokenizer.save_pretrained(output_dir)
+
+            logger.log_info("\n✨ Safetensors export completed successfully!")

-        logger.log_info("\n✨ Model export completed successfully!")
         logger.log_info("="*80)

     except Exception as e:
-        logger.log_error(f"Failed to save model: {str(e)}")
+        logger.log_error(f"\nFailed to save model: {str(e)}")
         raise
     finally:
         if torch.cuda.is_available():
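For context, a minimal usage sketch of the updated API is shown below. It assumes save_quantized_model is imported from quantllm.api.high_level and that model is a quantized PreTrainedModel; the output paths and quant settings are illustrative, not from the commit:

    from quantllm.api.high_level import save_quantized_model

    # Default path: GGUF export. quant_config must carry 'bits', since the
    # GGUF branch reads quant_config['bits'] directly.
    save_quantized_model(
        model,
        output_path="exports/model.gguf",
        save_format="gguf",
        quant_config={"bits": 4, "group_size": 128},
    )

    # New alternative: plain safetensors export via model.save_pretrained().
    save_quantized_model(
        model,
        output_path="exports/model",
        save_format="safetensors",
        safe_serialization=True,
    )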

quantllm/quant/llama_cpp_utils.py

Lines changed: 106 additions & 35 deletions
@@ -20,45 +20,96 @@ class LlamaCppConverter:
         "gpt_neox", "pythia", "stablelm", "phi"
     ]

+    LLAMA_CPP_REPO = "https://github.com/ggerganov/llama.cpp.git"
+    CONVERT_SCRIPTS = [
+        "convert_hf_to_gguf.py",
+        "convert_hf_to_gguf_update.py",
+        "convert_llama_to_gguf.py",
+        "convert_lora_to_gguf.py"
+    ]
+
     def __init__(self):
+        self.llama_cpp_path = None
+        self.convert_script = None
         self._setup_paths()

     def _setup_paths(self):
         """Setup llama.cpp paths and environment."""
-        self.llama_cpp_path = None
-        self.convert_script = None
+        # First check if we already have llama.cpp cloned
+        potential_paths = [
+            Path.cwd() / "llama.cpp",
+            Path.home() / "llama.cpp",
+            Path(os.getenv("LLAMA_CPP_DIR", "")) if os.getenv("LLAMA_CPP_DIR") else None
+        ]

-        # Try to find llama.cpp installation
-        try:
-            import llama_cpp
-            self.llama_cpp_path = Path(llama_cpp.__file__).parent
-            potential_paths = [
-                self.llama_cpp_path / "convert.py",
-                self.llama_cpp_path / "llama_cpp" / "convert.py",
-                Path(sys.prefix) / "llama_cpp_python" / "convert.py",
-            ]
-
-            for path in potential_paths:
-                if path.exists():
-                    self.convert_script = str(path)
-                    break
+        for path in potential_paths:
+            if path and path.exists():
+                # Check for any of the conversion scripts
+                for script in self.CONVERT_SCRIPTS:
+                    script_path = path / script
+                    if script_path.exists():
+                        self.llama_cpp_path = path
+                        self.convert_script = str(script_path)
+                        logger.log_info(f"✓ Found existing llama.cpp installation at: {path}")
+                        logger.log_info(f"✓ Using conversion script: {script}")
+                        return

-        except ImportError:
-            pass
+        # If not found, we'll clone it
+        self._clone_llama_cpp()

-    def _install_llama_cpp(self) -> bool:
-        """Install llama-cpp-python package."""
+    def _clone_llama_cpp(self):
+        """Clone llama.cpp from GitHub."""
         try:
-            logger.log_info("📦 Installing llama-cpp-python...")
-            subprocess.check_call([
-                sys.executable, "-m", "pip", "install",
-                "--upgrade", "llama-cpp-python"
-            ])
-            self._setup_paths()
-            return self.convert_script is not None
+            logger.log_info("\n🔄 Setting up llama.cpp:")
+            logger.log_info("-" * 40)
+
+            # Create a directory for llama.cpp
+            self.llama_cpp_path = Path.cwd() / "llama.cpp"
+            if self.llama_cpp_path.exists():
+                logger.log_info("• Cleaning existing llama.cpp directory...")
+                shutil.rmtree(self.llama_cpp_path)
+
+            # Clone the repository
+            logger.log_info("• Cloning llama.cpp repository...")
+            subprocess.run(
+                ["git", "clone", self.LLAMA_CPP_REPO],
+                check=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE
+            )
+
+            # Verify the clone
+            if not self.llama_cpp_path.exists():
+                raise RuntimeError("Failed to clone llama.cpp repository")
+
+            # Find appropriate conversion script
+            for script in self.CONVERT_SCRIPTS:
+                script_path = self.llama_cpp_path / script
+                if script_path.exists():
+                    self.convert_script = str(script_path)
+                    logger.log_info(f"• Found conversion script: {script}")
+                    break
+
+            if not self.convert_script:
+                # List available files for debugging
+                available_files = list(self.llama_cpp_path.glob("convert*.py"))
+                logger.log_info("• Available conversion scripts:")
+                for file in available_files:
+                    logger.log_info(f"  - {file.name}")
+                raise RuntimeError(
+                    "Could not find appropriate conversion script in llama.cpp. "
+                    f"Available scripts: {[f.name for f in available_files]}"
+                )
+
+            logger.log_info("• Successfully set up llama.cpp")
+            logger.log_info(f"• Convert script location: {self.convert_script}")
+
         except Exception as e:
-            logger.log_error(f"Failed to install llama-cpp-python: {e}")
-            return False
+            logger.log_error(f"Failed to clone/setup llama.cpp: {e}")
+            raise RuntimeError(
+                "Could not set up llama.cpp. Please clone manually:\n"
+                "git clone https://github.com/ggerganov/llama.cpp.git"
+            )

     def _detect_model_type(self, model: PreTrainedModel) -> str:
         """Detect model architecture type."""
@@ -99,7 +150,28 @@ def _save_model_config(self, model: PreTrainedModel, save_dir: str):

         # Add quantization info if available
         if hasattr(model.config, 'quantization_config'):
-            minimal_config['quantization_config'] = model.config.quantization_config
+            quant_config = model.config.quantization_config
+            if isinstance(quant_config, dict):
+                minimal_config['quantization_config'] = quant_config
+            else:
+                # Convert BitsAndBytesConfig to dict
+                minimal_config['quantization_config'] = {
+                    'bits': quant_config.bits if hasattr(quant_config, 'bits') else None,
+                    'group_size': quant_config.group_size if hasattr(quant_config, 'group_size') else None,
+                    'quant_method': quant_config.quant_method if hasattr(quant_config, 'quant_method') else None,
+                    'load_in_4bit': quant_config.load_in_4bit if hasattr(quant_config, 'load_in_4bit') else False,
+                    'load_in_8bit': quant_config.load_in_8bit if hasattr(quant_config, 'load_in_8bit') else False,
+                    'llm_int8_threshold': quant_config.llm_int8_threshold if hasattr(quant_config, 'llm_int8_threshold') else 6.0,
+                    'llm_int8_has_fp16_weight': quant_config.llm_int8_has_fp16_weight if hasattr(quant_config, 'llm_int8_has_fp16_weight') else False,
+                    'bnb_4bit_compute_dtype': str(quant_config.bnb_4bit_compute_dtype) if hasattr(quant_config, 'bnb_4bit_compute_dtype') else None,
+                    'bnb_4bit_quant_type': quant_config.bnb_4bit_quant_type if hasattr(quant_config, 'bnb_4bit_quant_type') else None,
+                    'bnb_4bit_use_double_quant': quant_config.bnb_4bit_use_double_quant if hasattr(quant_config, 'bnb_4bit_use_double_quant') else False
+                }
+                # Remove None values
+                minimal_config['quantization_config'] = {
+                    k: v for k, v in minimal_config['quantization_config'].items()
+                    if v is not None
+                }

         config_path = os.path.join(save_dir, "config.json")
         with open(config_path, 'w') as f:
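As a side note (an observation, not part of the diff): each `value if hasattr(obj, name) else default` expression above is equivalent to getattr with a default, which compresses the extraction considerably. A sketch of the same conversion, using a hypothetical helper name:

    def bnb_config_to_dict(quant_config) -> dict:
        """Hypothetical helper mirroring _save_model_config's extraction."""
        fields = {
            'bits': getattr(quant_config, 'bits', None),
            'group_size': getattr(quant_config, 'group_size', None),
            'quant_method': getattr(quant_config, 'quant_method', None),
            'load_in_4bit': getattr(quant_config, 'load_in_4bit', False),
            'llm_int8_threshold': getattr(quant_config, 'llm_int8_threshold', 6.0),
            # ...remaining bnb_4bit_* fields follow the same pattern.
        }
        # Drop unset values, as the committed code does.
        return {k: v for k, v in fields.items() if v is not None}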
@@ -132,11 +204,10 @@ def convert_to_gguf(
         # Create output directory
         os.makedirs(output_dir, exist_ok=True)

-        # Setup paths
-        if not self.convert_script and not self._install_llama_cpp():
+        # Ensure llama.cpp is available
+        if not self.convert_script:
             raise RuntimeError(
-                "Could not find or install llama-cpp-python. "
-                "Please install manually: pip install llama-cpp-python"
+                "llama.cpp conversion script not found. Please ensure llama.cpp is properly set up."
             )

         # Detect model type
@@ -145,7 +216,7 @@ def convert_to_gguf(

         # Create temporary directory for minimal checkpoint
         with tempfile.TemporaryDirectory() as temp_dir:
-            logger.log_info("💾 Preparing model for conversion:")
+            logger.log_info("\n💾 Preparing model for conversion:")
             logger.log_info("-" * 40)

             # Save minimal checkpoint
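Putting the pieces together, the converter's entry point can also be driven directly, as the GGUF branch of save_quantized_model does. A minimal sketch (output_dir and the bit settings are illustrative; model is a quantized PreTrainedModel):

    from quantllm.quant.llama_cpp_utils import LlamaCppConverter

    converter = LlamaCppConverter()   # finds or clones llama.cpp on init
    gguf_path = converter.convert_to_gguf(
        model=model,
        output_dir="gguf_out",
        bits=4,
        group_size=128,
        save_tokenizer=True,
    )
    print(f"GGUF file written to: {gguf_path}")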
