28 changes: 1 addition & 27 deletions auto_round/__main__.py
@@ -693,33 +693,7 @@ def tune(args):
trust_remote_code=not args.disable_trust_remote_code,
)

model_name = args.model.rstrip("/")

if model_name.split("/")[-1].strip(".") == "" and "gguf" not in args.format:
if autoround.group_size <= 0:
if "fp" in autoround.act_data_type:
suffix = f"afp{autoround.act_bits}"
else:
suffix = f"a{autoround.act_bits}"
else:
suffix = f"g{autoround.group_size}"
export_dir = os.path.join(args.output_dir, f"w{autoround.bits}{suffix}")
elif model_name.split("/")[-1].strip(".") == "" and "gguf" in args.format:
export_dir = args.output_dir
elif model_name.split("./")[-1].strip("./") != "" and "gguf" in args.format:
export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + "-gguf")
else:
if autoround.group_size <= 0:
if "fp" in autoround.act_data_type:
suffix = f"afp{autoround.act_bits}"
else:
suffix = f"a{autoround.act_bits}"
else:
suffix = f"g{autoround.group_size}"
export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{autoround.bits}{suffix}")

# ======================= Quantize and save model =======================
model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101
model, folders = autoround.quantize_and_save(args.output_dir, format=args.format) # pylint: disable=E1101
tokenizer = autoround.tokenizer # pylint: disable=E1101

model.eval()
198 changes: 128 additions & 70 deletions auto_round/compressors/base.py
@@ -307,20 +307,18 @@ def __init__(
self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
self.quant_lm_head = kwargs.pop("quant_lm_head", False)

# Should be set after the model is loaded and layer_config is set, since some special schemes need them.
# Preserve the original, unparsed scheme for later use in auto scheme generation
# within `configure_layer_config` (which may need the raw value instead of `self.scheme`).
self.scheme = scheme
self.orig_scheme = copy.deepcopy(scheme)
self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(scheme, kwargs)

gguf_scheme_name = get_gguf_scheme(self.scheme)
# GGUF uses fp32 scale dtype as default
scale_dtype = kwargs.pop("scale_dtype", None)
if scale_dtype is None:
scale_dtype = "fp32" if gguf_scheme_name else "fp16"
self.is_auto_scheme = True if isinstance(scheme, AutoScheme) else False
self.scale_dtype = kwargs.pop("scale_dtype", None)

# Extra/legacy kwargs for backward compatibility
# Major version releases may pack them with extra configuration options
scheme_keys = [f.name for f in fields(QuantizationScheme)]
for key in scheme_keys:
if key in kwargs and kwargs[key] is not None:
setattr(self, key, kwargs.pop(key))

amp = kwargs.pop("amp", True)
lr = kwargs.pop("lr", None)
enable_minmax_tuning = kwargs.pop("enable_minmax_tuning", True)
@@ -344,10 +342,9 @@ def __init__(
self.platform = platform

self.ignore_layers = kwargs.pop("ignore_layers", "")
self.supported_types = SUPPORTED_LAYER_TYPES
self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
self.scale_dtype = convert_dtype_str2torch(scale_dtype)

self.low_cpu_mem_usage = low_cpu_mem_usage
self.block_forward = block_forward

if kwargs:
logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.")
@@ -379,16 +376,10 @@ def __init__(
self.device_map = device_map
if isinstance(self.device_map, str):
self.device_map = self.device_map.replace(" ", "")

self.device_list = parse_available_devices(device_map)

# Set device, must place after model loading
self.device = get_major_device(device_map)
set_non_auto_device_map(self.model, self.device_map)
self.device = get_major_device(self.device_map)

# Tuning hyperparameters
self.seed = seed
set_seed(self.seed)
self.amp = amp
self.enable_quanted_input = enable_quanted_input
self.enable_minmax_tuning = enable_minmax_tuning
@@ -423,24 +414,7 @@ def __init__(
if enable_opt_rtn:
disable_opt_rtn = False
self.orig_disable_opt_rtn = disable_opt_rtn

if self.iters != 0 and self.orig_disable_opt_rtn is not None:
logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.")
disable_opt_rtn = True
if (
self.bits >= 8
and self.act_bits >= 16
and self.iters == 0
and self.data_type == "int"
and disable_opt_rtn is None
):
logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.")
disable_opt_rtn = True
if disable_opt_rtn is None and self.iters == 0:
logger.info(
"`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy."
)
disable_opt_rtn = False
self.disable_opt_rtn = disable_opt_rtn

# Important Note! This is not very robust; do NOT rely on it for anything high-risk
self.is_moe_model = is_moe_model(self.model)
@@ -451,7 +425,6 @@ def __init__(
self.dynamic_max_gap = dynamic_max_gap
self.lr_scheduler = lr_scheduler
self.optimizer = self._get_optimizer(None)
self.disable_opt_rtn = disable_opt_rtn

# Whether to pack the layer immediately after tuning
self.is_immediate_packing = False
@@ -467,8 +440,78 @@ def __init__(
if self.static_attention_dtype is not None:
logger.warning("The static attention dtype is experimental and currently has limited support.")

self._set_amp_dtype()
self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device

self.batch_dim = None
self.infer_bs_coeff = 1

# after setting iters
self.enable_torch_compile = enable_torch_compile

self.attention_mask = []
self.wrapper_block = wrapper_block

torch.set_printoptions(precision=3, sci_mode=True)

self._post_inited = False

def _post_init(self) -> None:
"""Post-initialization for AutoRound."""
if self._post_inited:
return

# Should be set after the model is loaded and layer_config is set, since some special schemes need them.
# Preserve the original, unparsed scheme for later use in auto scheme generation
# within `configure_layer_config` (which may need the raw value instead of `self.scheme`).
self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme)

# GGUF uses fp32 scale dtype as default
if self.scale_dtype is None:
gguf_scheme_name = get_gguf_scheme(self.scheme)
scale_dtype = "fp32" if gguf_scheme_name else "fp16"
else:
scale_dtype = self.scale_dtype
self.scale_dtype = convert_dtype_str2torch(scale_dtype)

predefined_ignore_layers = get_predefined_ignore_layers(self.model)

if predefined_ignore_layers:
logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}")
tmp_str = ",".join(predefined_ignore_layers)
if self.ignore_layers == "":
self.ignore_layers = tmp_str
else:
self.ignore_layers += "," + tmp_str

# Set device; must be placed after model loading
self._set_device(self.device_map)
set_non_auto_device_map(self.model, self.device_map)
self.device_list = parse_available_devices(self.device_map)

if self.iters != 0 and self.orig_disable_opt_rtn is not None:
logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.")
self.disable_opt_rtn = True
if (
self.bits >= 8
and self.act_bits >= 16
and self.iters == 0
and self.data_type == "int"
and self.disable_opt_rtn is None
):
logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.")
self.disable_opt_rtn = True
if self.disable_opt_rtn is None and self.iters == 0:
logger.info(
"`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy."
)
self.disable_opt_rtn = False

set_seed(self.seed)
self._set_amp_dtype()
self._adjust_torch_compile(self.enable_torch_compile)
if self.enable_torch_compile:
self.block_forward = compile_func(self.block_forward, self.device)

if self.act_bits <= 8 and self.amp_dtype == torch.float16:
logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization")
self.amp_dtype = torch.bfloat16
@@ -480,24 +523,16 @@ def __init__(
# Some helpers
if "hpu" in str(self.device):
self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear")
self.batch_dim = None
self.infer_bs_coeff = 1

# after setting iters
self.enable_torch_compile = enable_torch_compile
self._adjust_torch_compile(enable_torch_compile)

self.block_forward = compile_func(block_forward, self.device) if self.enable_torch_compile else block_forward
self._check_configs()
torch.set_printoptions(precision=3, sci_mode=True)

if isinstance(self.scheme, AutoScheme):
self.layer_config = self._gen_auto_scheme(self.model, self.orig_scheme, self.dataset, self.device_map)

if is_hpex_available():
logger.info("habana_frameworks is available, import htcore explicitly.")
import habana_frameworks.torch.core as htcore # pylint: disable=E0401

self.attention_mask = []

self.wrapper_block = wrapper_block
if self.enable_alg_ext:
try:
logger.warning_once("using algorithm extension for quantization.")
@@ -506,6 +541,7 @@ def __init__(
wrapper_autoround(self)
except (ImportError, ModuleNotFoundError):
logger.error("algorithm extension import error, fallback to default mode")
self._post_inited = True

def _gen_auto_scheme(
self, model: torch.nn.Module, scheme: AutoScheme, dataset: str, device_map: Union[str, int, dict, torch.device]
@@ -612,18 +648,18 @@ def _set_device(self, device_map: Union[str, torch.device, int, dict]) -> None:
raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}")

def _parse_and_set_scheme(
self, scheme: Union[str, dict, QuantizationScheme], kwargs
self,
scheme: Union[str, dict, QuantizationScheme],
) -> tuple[QuantizationScheme, bool]:
"""Parse and set the quantization scheme."""

def _parse_and_set(scheme, kwargs):
if kwargs.get("data_type", None) and kwargs["data_type"].endswith("_dq") and not scheme.startswith("gguf"):
if "bits" not in kwargs:
data_type = kwargs["data_type"]
def _parse_and_set(scheme):
if getattr(self, "data_type", None) and self.data_type.endswith("_dq") and not scheme.startswith("gguf"):
if not hasattr(self, "bits") or self.bits is None:
raise KeyError(
f"please set bits when setting data_type={data_type}, or using scheme as an alternative."
f"please set bits when setting data_type={self.data_type}, or using scheme as an alternative."
)
bits = kwargs["bits"]
bits = self.bits
scheme = f"gguf:q{bits}_k" if bits == 6 else f"gguf:q{bits}_k_s"
res = None
if isinstance(scheme, QuantizationScheme):
@@ -649,11 +685,10 @@ def _parse_and_set(scheme, kwargs):
scheme = asdict(preset_name_to_scheme(scheme))
scheme_keys = [f.name for f in fields(QuantizationScheme)]
for key in scheme_keys:
if key in kwargs and kwargs[key] is not None:
setattr(self, key, kwargs[key])
if hasattr(self, key) and getattr(self, key) is not None:
continue
else:
setattr(self, key, scheme.get(key, None))
# kwargs.pop(key, None)
if self.act_dynamic is None:
self.act_dynamic = True

@@ -709,7 +744,7 @@ def _parse_and_set(scheme, kwargs):
raise ValueError("options of AutoScheme must not be empty")
options = []
for option in scheme.options:
new_option = _parse_and_set(option, kwargs)
new_option = _parse_and_set(option)
options.append(new_option)
scheme.options = options
for opt in options:
@@ -721,16 +756,12 @@ def _parse_and_set(scheme, kwargs):
self.scheme = opt # Choose the first one that not 16 bits
break
# apply scheme to set default bits
scheme = _parse_and_set(self.scheme, kwargs)
scheme = _parse_and_set(self.scheme)
is_auto_scheme = True
else:
scheme = _parse_and_set(scheme, kwargs)
scheme = _parse_and_set(scheme)
is_auto_scheme = False

scheme_keys = [f.name for f in fields(QuantizationScheme)]
for key in scheme_keys:
kwargs.pop(key, None)

return scheme, is_auto_scheme

def _adjust_torch_compile(self, enable_torch_compile: bool) -> None:
@@ -888,6 +919,29 @@ def quantize_and_save(
Raises:
ValueError: If an unsupported format is specified.
"""
# post init
self._post_init()

name_or_path = self.model.name_or_path.rstrip("/")
model_name = name_or_path.split("/")[-1]
if model_name.strip(".") == "" and "gguf" not in format:
if self.group_size <= 0:
suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}"
else:
suffix = f"g{self.group_size}"
export_dir = os.path.join(output_dir, f"w{self.bits}{suffix}")
elif model_name.strip(".") == "" and "gguf" in format:
export_dir = output_dir
elif model_name.strip(".") != "" and "gguf" in format:
export_dir = os.path.join(output_dir, model_name + "-gguf")
else:
if self.group_size <= 0:
suffix = f"afp{self.act_bits}" if "fp" in self.act_data_type else f"a{self.act_bits}"
else:
suffix = f"g{self.group_size}"
export_dir = os.path.join(output_dir, model_name + f"-w{self.bits}{suffix}")

output_dir = export_dir
# Validate and process the specified formats
self.orig_output_dir = output_dir

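For illustration, a minimal standalone sketch of the export-directory naming rule added in quantize_and_save() above; the model name, bit width, and group size below are assumed values, not taken from this PR:

# Self-contained sketch of the naming rule above; all concrete values are assumed.
import os

output_dir = "./tmp_autoround"   # value passed to quantize_and_save()
model_name = "Qwen2.5-0.5B"      # last component of model.name_or_path (hypothetical)
bits, group_size, act_bits = 4, 128, 16
act_data_type = "int"

if group_size <= 0:
    # no positive group size: encode the activation settings in the suffix instead
    suffix = f"afp{act_bits}" if "fp" in act_data_type else f"a{act_bits}"
else:
    suffix = f"g{group_size}"

export_dir = os.path.join(output_dir, model_name + f"-w{bits}{suffix}")
print(export_dir)  # ./tmp_autoround/Qwen2.5-0.5B-w4g128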
@@ -1637,6 +1691,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
Returns:
The quantized model and layer configurations.
"""
# post init
self._post_init()

self._check_compatibility()
formats = self.formats if hasattr(self, "formats") else None
@@ -2719,6 +2775,7 @@ def quantize_block(
"""

# TODO: release below assertion after supporting MLLM and diffusion model quantization with quantize_block
self._post_init()
assert self.__class__.__name__ not in [
"DiffusionCompressor",
"MLLMCompressor",
@@ -3141,7 +3198,7 @@ def save_quantized(
output_dir: str = None,
format: Union[str, list[OutputFormat]] = "auto_round",
inplace: bool = True,
return_folders=False,
return_folders=True,
**kwargs,
) -> torch.nn.Module:
"""Save the quantized model to the specified output directory in the specified format.
Expand All @@ -3155,6 +3212,7 @@ def save_quantized(
Returns:
object: The compressed model object.
"""

self.orig_output_dir = output_dir
if isinstance(format, str) and getattr(self, "formats", None) is None:
formats = get_formats(format, self)
@@ -3194,7 +3252,7 @@ def save_quantized(
folders.append(save_folder)

if return_folders:
return compressed_model, folders
return compressed_model, folders[0] if len(folders) == 1 else folders
else:
return compressed_model

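For context, a minimal usage sketch of the updated entry point. The import path is the library's usual one, but the constructor arguments, model name, and scheme below are assumptions for illustration, not taken from this diff:

# Hypothetical usage of the updated API: quantize_and_save() now derives the
# export sub-directory itself, and, mirroring the save_quantized() change above,
# a single exported format may come back as one folder string rather than a list.
from auto_round import AutoRound  # assumed import path

autoround = AutoRound("Qwen/Qwen2.5-0.5B-Instruct", scheme="W4A16")  # assumed call signature
model, folders = autoround.quantize_and_save("./tmp_autoround", format="auto_round")

# Normalize before use, since `folders` may be a str or a list of str.
folder = folders[-1] if isinstance(folders, list) else folders
print(folder)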
2 changes: 1 addition & 1 deletion auto_round/eval/evaluation.py
@@ -380,7 +380,7 @@ def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_s
return

# Check if evaluation is needed for language models
eval_folder = folders[-1] if folders else None
eval_folder = folders[-1] if folders and isinstance(folders, list) else folders
if args.tasks is None or args.tasks == "" or eval_folder is None:
return

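Finally, a small sketch of the normalization idea behind the run_model_evaluation change above; the helper name is hypothetical:

# Hypothetical helper mirroring the eval_folder logic: `folders` may now be a
# single path string, a list of paths, or empty/None.
from typing import Optional, Union

def pick_eval_folder(folders: Union[str, list[str], None]) -> Optional[str]:
    if not folders:
        return None
    return folders[-1] if isinstance(folders, list) else folders

assert pick_eval_folder(["./out/a", "./out/b"]) == "./out/b"
assert pick_eval_folder("./out/a") == "./out/a"
assert pick_eval_folder(None) is None
assert pick_eval_folder([]) is None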