
Commit 2007749

Update the quantization handling for the llama.cpp (GGUF) export.
1 parent 9756b1f · commit 2007749

File tree: 2 files changed, +100 -260 lines changed


quantllm/api/high_level.py

Lines changed: 47 additions & 46 deletions
@@ -300,7 +300,8 @@ def save_quantized_model(
     save_format: str = "gguf",
     save_tokenizer: bool = True,
     quant_config: Optional[Dict[str, Any]] = None,
-    safe_serialization: bool = True
+    safe_serialization: bool = True,
+    verbose: bool = False
 ):
     """
     Save a quantized model in either GGUF or safetensors format.
@@ -312,52 +313,49 @@ def save_quantized_model(
         save_tokenizer: Whether to save the tokenizer
         quant_config: Optional quantization configuration
         safe_serialization: Whether to use safe serialization for safetensors format
+        verbose: Whether to show detailed progress logs
     """
     try:
-        logger.log_info("\n" + "="*80)
-        logger.log_info(f"Starting Model Export Process ({save_format.upper()})".center(80))
-        logger.log_info("="*80 + "\n")
-
-        # Log model details
-        total_params = sum(p.numel() for p in model.parameters())
-        model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3)
-
-        logger.log_info("📊 Model Information:")
-        logger.log_info("-"*40)
-        logger.log_info(f"• Architecture: {model.config.model_type}")
-        logger.log_info(f"• Total Parameters: {total_params:,}")
-        logger.log_info(f"• Model Size: {model_size_gb:.2f} GB")
-        logger.log_info(f"• Export Format: {save_format.upper()}")
-        logger.log_info("")
+        if not verbose:
+            logger.log_info(f"Converting model to {save_format.upper()} format...")
+        else:
+            logger.log_info("\n" + "="*80)
+            logger.log_info(f"Starting Model Export Process ({save_format.upper()})".center(80))
+            logger.log_info("="*80 + "\n")
+
+            # Log model details
+            total_params = sum(p.numel() for p in model.parameters())
+            model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3)
+
+            logger.log_info("📊 Model Information:")
+            logger.log_info("-"*40)
+            logger.log_info(f"• Architecture: {model.config.model_type}")
+            logger.log_info(f"• Total Parameters: {total_params:,}")
+            logger.log_info(f"• Model Size: {model_size_gb:.2f} GB")
+            logger.log_info(f"• Export Format: {save_format.upper()}")
+            logger.log_info("")

         # Get quantization info
-        if not quant_config:
-            if hasattr(model.config, 'quantization_config'):
-                config_dict = model.config.quantization_config
-                if isinstance(config_dict, BitsAndBytesConfig):
-                    # Handle BitsAndBytesConfig
-                    bits = 4 if config_dict.load_in_4bit else (8 if config_dict.load_in_8bit else 16)
-                    quant_config = {
-                        'bits': bits,
-                        'group_size': 128, # Default group size
-                        'quant_type': f"Q{bits}_K_M" if bits <= 8 else "F16"
-                    }
+        if not quant_config and hasattr(model.config, 'quantization_config'):
+            config_dict = model.config.quantization_config
+            if isinstance(config_dict, BitsAndBytesConfig):
+                bits = 4 if config_dict.load_in_4bit else (8 if config_dict.load_in_8bit else 16)
+                quant_config = {
+                    'bits': bits,
+                    'group_size': 128,
+                    'quant_type': f"Q{bits}_K_M" if bits <= 8 else "F16"
+                }
+                if verbose:
                     logger.log_info("📊 Quantization Configuration:")
                     logger.log_info("-"*40)
                     logger.log_info(f"• Bits: {bits}")
                     logger.log_info(f"• Quantization Type: {quant_config['quant_type']}")
                     if config_dict.load_in_4bit:
                         logger.log_info(f"• 4-bit Type: {config_dict.bnb_4bit_quant_type}")
                         logger.log_info(f"• Compute dtype: {config_dict.bnb_4bit_compute_dtype}")
-                else:
-                    quant_config = config_dict
+                    logger.log_info("")
             else:
-                logger.log_info("\nUsing default 4-bit quantization settings")
-                quant_config = {
-                    'bits': 4,
-                    'group_size': 128,
-                    'quant_type': "Q4_K_M"
-                }
+                quant_config = config_dict

         # Create output directory
         output_dir = os.path.dirname(output_path) or "."
@@ -371,35 +369,38 @@ def save_quantized_model(
             gguf_path = converter.convert_to_gguf(
                 model=model,
                 output_dir=output_dir,
-                bits=quant_config['bits'],
-                group_size=quant_config.get('group_size', 128),
+                bits=quant_config['bits'] if quant_config else 4,
+                group_size=quant_config.get('group_size', 128) if quant_config else 128,
                 save_tokenizer=save_tokenizer
             )

-            logger.log_info("\n✨ GGUF export completed successfully!")
+            if verbose:
+                file_size = os.path.getsize(gguf_path) / (1024**3)
+                logger.log_info(f"\nGGUF model saved ({file_size:.2f} GB): {gguf_path}")
+            else:
+                logger.log_info("✓ GGUF conversion completed successfully!")

         else: # safetensors format
-            logger.log_info("\n💾 Saving model in safetensors format:")
-            logger.log_info("-"*40)
+            if verbose:
+                logger.log_info("\n💾 Saving model in safetensors format...")

             # Save the model
             model.save_pretrained(
                 output_dir,
                 safe_serialization=safe_serialization
             )
-            logger.log_info("• Model weights saved successfully")

             # Save tokenizer if requested
             if save_tokenizer and hasattr(model, 'tokenizer'):
-                logger.log_info("• Saving tokenizer...")
                 model.tokenizer.save_pretrained(output_dir)

-            logger.log_info("\n✨ Safetensors export completed successfully!")
-
-            logger.log_info("="*80)
+            if verbose:
+                logger.log_info("✓ Model saved successfully in safetensors format!")
+            else:
+                logger.log_info("✓ Model saved successfully!")

     except Exception as e:
-        logger.log_error(f"\nFailed to save model: {str(e)}")
+        logger.log_error(f"Failed to save model: {str(e)}")
         raise
     finally:
         if torch.cuda.is_available():
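
For context, a minimal usage sketch of the updated function follows. It is not part of this commit: the import path is assumed from the file touched here (quantllm/api/high_level.py), the example model name and the bnb loading step are illustrative, and the quant_config values simply mirror the Q4_K_M defaults visible in the diff. The model and output_path parameters are inferred from the function body shown above.

# Minimal usage sketch; import path, model name, and leading parameters are assumptions.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from quantllm.api.high_level import save_quantized_model

# Load an example model in 4-bit via bitsandbytes (illustrative model id).
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)

save_quantized_model(
    model=model,
    output_path="exports/model.gguf",
    save_format="gguf",        # llama.cpp-compatible GGUF export
    save_tokenizer=True,       # only saved if the tokenizer is attached as model.tokenizer
    quant_config={             # optional; derived from model.config.quantization_config when omitted
        'bits': 4,
        'group_size': 128,
        'quant_type': "Q4_K_M",
    },
    verbose=True,              # new flag in this commit: detailed progress logging
)

Because verbose defaults to False, existing callers get the new single-line progress messages without any code changes; passing verbose=True restores the detailed model and quantization summaries.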
