Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
| **auto_round** | W4A16(Recommended), W2A16, W3A16, W8A16, W2A16G64, W2A16G32, `MXFP4`, `MXFP8`, `MXFP4_RCEIL`, `MXFP8_RCEIL`, `NVFP4`, `FPW8A16`, `FP8_STATIC`, `BF16` |
| **auto_awq** | W4A16(Recommended), BF16 |
| **auto_gptq** | W4A16(Recommended), W2A16, W3A16, W8A16, W2A16G64, W2A16G32,BF16 |
| **llm_compressor** | NVFP4(Recommended), `MXFP4`, `MXFP8`, `FPW8A16`, `FP8_STATIC` |
| **llm_compressor** | NVFP4(Recommended), `MXFP4`, `MXFP8`, `FPW8A16`, `FP8_STATIC`, `FP8_BLOCK`, `INT8_W8A8`, `W4A16`, `W8A16` |
| **gguf** | GGUF:Q4_K_M(Recommended), GGUF:Q2_K_S, GGUF:Q3_K_S, GGUF:Q3_K_M, GGUF:Q3_K_L, GGUF:Q4_K_S, GGUF:Q5_K_S, GGUF:Q5_K_M, GGUF:Q6_K, GGUF:Q4_0, GGUF:Q4_1, GGUF:Q5_0, GGUF:Q5_1,GGUF:Q8_0 |
| **fake** | `all schemes (only for research)` |
</details>
Expand Down
2 changes: 1 addition & 1 deletion README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
|**auto_round**| W4A16(推荐)、W2A16、W3A16、W8A16、W2A16G64、W2A16G32、`MXFP4`​、`MXFP8`​、`MXFP4_RCEIL`​、`MXFP8_RCEIL`​、`NVFP4`​、`FPW8A16`​、`FP8_STATIC`​、`BF16` |
|**auto_awq**| W4A16(推荐)、BF16 |
|**auto_gptq**| W4A16(推荐)、W2A16、W3A16、W8A16、W2A16G64、W2A16G32、BF16 |
|**llm_compressor**| NVFP4(推荐)、`MXFP4`​、`MXFP8`​、`FPW8A16`​、`FP8_STATIC` |
|**llm_compressor**| NVFP4(推荐)、`MXFP4`​、`MXFP8`​、`FPW8A16`​、`FP8_STATIC`、`FP8_BLOCK`、`INT8_W8A8`、`W4A16`、`W8A16` |
|**gguf**| GGUF:Q4\_K\_M(推荐)、GGUF:Q2\_K\_S、GGUF:Q3\_K\_S、GGUF:Q3\_K\_M、GGUF:Q3\_K\_L、GGUF:Q4\_K\_S、GGUF:Q5\_K\_S、GGUF:Q5\_K\_M、GGUF:Q6\_K、GGUF:Q4\_0、GGUF:Q4\_1、GGUF:Q5\_0、GGUF:Q5\_1、GGUF:Q8\_0 |
|**fake**| ​`所有方案(仅用于研究)` |
</details>
Expand Down
10 changes: 10 additions & 0 deletions auto_round/compressors/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,16 @@ def is_static_wfp8afp8(ar_or_format: Union[str, Callable]) -> bool:
return False


def is_wint_woq(ar: Callable) -> bool:
    """Check whether *ar* describes integer weight-only quantization.

    True when the weight data type is an integer variant, activations are
    left unquantized (`act_bits >= 16`), and no double/super grouping is used.
    """
    # Guard clauses mirror the original short-circuit evaluation order.
    if "int" not in ar.data_type:
        return False
    if not ar.act_bits >= 16:
        return False
    return ar.super_group_size is None


def is_wint_a16(ar: Callable) -> bool:
    """Backward-compatible alias for `is_wint_woq()`.

    Kept so existing callers using the old name continue to work; delegates
    the actual check unchanged.
    """
    result = is_wint_woq(ar)
    return result


def is_dynamic_wint8aint8(ar_or_format: Union[str, Callable]) -> bool:
if isinstance(ar_or_format, str):
return "int8_w8a8" in ar_or_format.lower()
Expand Down
68 changes: 31 additions & 37 deletions auto_round/export/export_to_llmcompressor/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,18 @@ def construct_ct_scheme(layer):
strategy=_get_weight_scheme_strategy(layer.group_size),
block_structure=layer.group_size if _get_weight_scheme_strategy(layer.group_size) == "block" else None,
)
activations_args = QuantizationArgs(
num_bits=layer.act_bits,
type=_get_scheme_type(layer.act_data_type),
symmetric=layer.act_sym,
dynamic=layer.act_dynamic,
group_size=layer.act_group_size if _get_act_scheme_strategy(layer.act_group_size) == "group" else None,
strategy=_get_act_scheme_strategy(layer.act_group_size),
)
# Weight-only quantization (W4A16, W8A16, etc.): no activation quantization
if layer.act_bits >= 16 or layer.act_data_type is None:
activations_args = None
else:
activations_args = QuantizationArgs(
num_bits=layer.act_bits,
type=_get_scheme_type(layer.act_data_type),
symmetric=layer.act_sym,
dynamic=layer.act_dynamic,
group_size=layer.act_group_size if _get_act_scheme_strategy(layer.act_group_size) == "group" else None,
strategy=_get_act_scheme_strategy(layer.act_group_size),
)
scheme = QuantizationScheme(
targets=[layer.__class__.__name__],
weights=weights_args,
Expand All @@ -98,38 +102,28 @@ def _get_quant_format(model):


def _compress_and_set_format(layer, scheme, device=None):
"""Compress a layer and set its quantization format, with backward compatibility.
"""Compress a layer and set its quantization format.

Supports both old and new compressed_tensors APIs:
- New API: compress_module() handles compression, state dict replacement, status, and format.
- Old API: NaiveQuantizationCompressor instance + manual param management + set_per_module_format.
The new API was introduced after compressed_tensors v0.14.1, may be available in v0.15+.
Compatible with multiple compressed_tensors versions.
"""
try:
from compressed_tensors.compressors import compress_module # pylint: disable=E0401

compress_module(layer)
except (ImportError, TypeError):
from compressed_tensors.compressors import NaiveQuantizationCompressor # pylint: disable=E0401
from compressed_tensors.config.format import set_per_module_format # pylint: disable=E0401
from compressed_tensors.quantization import QuantizationStatus # pylint: disable=E0401
from compressed_tensors.utils import ( # pylint: disable=E0401
delete_offload_parameter,
register_offload_parameter,
)

compressor = NaiveQuantizationCompressor()
q_state_dict = compressor.compress(layer.state_dict(), names_to_scheme={"": scheme}, show_progress=False)

for param_name, _ in list(layer.named_parameters(recurse=False)):
delete_offload_parameter(layer, param_name)

for param_name, value in q_state_dict.items():
param = torch.nn.Parameter(value, requires_grad=False)
register_offload_parameter(layer, param_name, param, device)

layer.quantization_status = QuantizationStatus.COMPRESSED
set_per_module_format(layer)
# Newer compressed_tensors export path
from compressed_tensors.compressors import compress_module as _compress_module # pylint: disable=E0401
except ImportError:
try:
# Older versions expose this from module path only
from compressed_tensors.compressors.base import compress_module as _compress_module # pylint: disable=E0401
except ImportError as e:
logger.error(
"Unable to import compress_module from compressed_tensors "
"(tried compressed_tensors.compressors and "
"compressed_tensors.compressors.base). "
"Please install/upgrade compressed-tensors."
)
raise ImportError(
"compress_module not found in compressed_tensors. " "Install a compatible version."
) from e
_compress_module(layer)


def pack_layer(name, model, device=None):
Expand Down
17 changes: 15 additions & 2 deletions auto_round/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
is_standard_fp,
is_static_wfp8afp8,
is_wfp8afp8,
is_wint_woq,
)
from auto_round.export.export_to_gguf.config import ModelType
from auto_round.schemes import (
Expand Down Expand Up @@ -69,6 +70,7 @@ class AutoRoundExportFormat(str, Enum):
NV_FP4_WITH_STATIC_GS = "nv_fp4_with_static_gs"
INT8_W8A8 = "int8_w8a8"
FP8_BLOCK = "fp8_block"
WINT_A16 = "wint_a16"


if TYPE_CHECKING:
Expand Down Expand Up @@ -345,7 +347,7 @@ def save_quantized(

@OutputFormat.register("llm_compressor")
class LLMCompressorFormat(OutputFormat):
support_schemes = ["MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC", "INT8_W8A8", "FP8_BLOCK"]
support_schemes = ["MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC", "INT8_W8A8", "FP8_BLOCK", "W4A16", "W8A16"]
format_name = "llm_compressor"

def __init__(self, format, ar):
Expand Down Expand Up @@ -384,6 +386,11 @@ def __init__(self, format, ar):

check_compressed_tensors_supported()
self.backend = LLMCompressorFormat(AutoRoundExportFormat.INT8_W8A8.value, ar)
elif is_wint_woq(ar):
from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported

check_compressed_tensors_supported()
self.backend = LLMCompressorFormat(AutoRoundExportFormat.WINT_A16.value, ar)
else:
if format.upper() not in list(AutoRoundExportFormat.__members__.keys()):
raise KeyError(f"Unsupported backend format llm_compressor:{format}, please check")
Expand All @@ -397,7 +404,9 @@ def check_scheme_args(cls: OutputFormat, scheme: QuantizationScheme) -> bool:
error_logs.append(f"bits={scheme.bits}")
if not re.search("mxfp|fp|nvfp|int", scheme.data_type):
error_logs.append(f"data_type={scheme.data_type}")
if scheme.data_type in ["fp", "int"] and scheme.bits != 8:
if scheme.data_type == "fp" and scheme.bits != 8:
error_logs.append(f"data_type={scheme.data_type}, bits={scheme.bits}")
if scheme.data_type == "int" and scheme.bits not in [4, 8]:
error_logs.append(f"data_type={scheme.data_type}, bits={scheme.bits}")
if scheme.super_bits:
error_logs.append(f"super_bits={scheme.super_bits}")
Expand Down Expand Up @@ -470,6 +479,10 @@ def pack_layer(self, layer_name, model, device=None, **kwargs):
elif re.search(f"{AutoRoundExportFormat.FP8_BLOCK.value}", self.output_format):
from auto_round.export.export_to_llmcompressor.export import pack_layer

return pack_layer(layer_name, model, device=device)
elif re.search(f"{AutoRoundExportFormat.WINT_A16.value}", self.output_format):
from auto_round.export.export_to_llmcompressor.export import pack_layer

return pack_layer(layer_name, model, device=device)
## passed as no other llm_compressor format is supported yet
logger.warning("No other llm_compressor packing format(except NVFP&MXFP) is supported yet, skip packing")
Expand Down
58 changes: 51 additions & 7 deletions test/test_cpu/export/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,28 +471,72 @@ def test_autoawq_qwen3_vl_infer(self, dataloader):
"model.visual.blocks" in modules_to_not_convert
), f"'model.visual.blocks' should be in modules_to_not_convert. Got: {modules_to_not_convert}"

def test_llmc_dynamic_wint8aint8_export(self):
@pytest.mark.parametrize(
"iters,use_dataloader",
[
(0, False), # RTN (no tuning)
(1, True), # with tuning
],
ids=["rtn", "tuning"],
)
def test_llmc_dynamic_wint8aint8_export(self, iters, use_dataloader, dataloader):
from safetensors import safe_open

dataset = dataloader if use_dataloader else None
autoround = AutoRound(
self.model_name,
iters=0,
iters=iters,
nsamples=2,
seqlen=2,
dataset=dataset,
scheme="INT8_W8A8",
)
quantized_model_path = self.save_dir
autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
with safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") as f:
assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys()
assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.int8

def test_llmc_dynamic_wint8aint8_export_with_tuning(self, dataloader):
shutil.rmtree(quantized_model_path, ignore_errors=True)

@pytest.mark.parametrize(
"scheme,bits,group_size,sym",
[
("W4A16", 4, 128, True),
("W4A16", 4, -1, True),
("W8A16", 8, -1, True),
],
)
def test_llmc_wint_a16_export(self, scheme, bits, group_size, sym):
from safetensors import safe_open

autoround = AutoRound(self.model_name, iters=1, nsamples=2, seqlen=2, dataset=dataloader, scheme="INT8_W8A8")
autoround = AutoRound(
self.model_name,
iters=2,
nsamples=2,
seqlen=2,
scheme=scheme,
Comment thread
thuang6 marked this conversation as resolved.
bits=bits,
group_size=group_size,
sym=sym,
)
quantized_model_path = self.save_dir
autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor")
with safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") as f:
assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys()
assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.int8
# weights must be packed as int32 (compressed-tensors stores both int4 and int8 as torch.int32)
weight = f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight_packed")
assert weight.dtype == torch.int32, f"Expected int32 weight for {scheme}, got {weight.dtype}"
# weight_scale must be present and be a float tensor
scale_key = "model.decoder.layers.8.self_attn.k_proj.weight_scale"
assert scale_key in f.keys(), f"Missing {scale_key} for {scheme} export"
scale = f.get_tensor(scale_key)
assert scale.dtype in (
torch.float32,
torch.float16,
torch.bfloat16,
), f"Expected float weight_scale for {scheme}, got {scale.dtype}"
# No input_scale should be present for weight-only quantization
input_scale_keys = [k for k in f.keys() if k.endswith(".input_scale")]
assert (
len(input_scale_keys) == 0
), f"Expected no input_scale for weight-only {scheme}, but found: {input_scale_keys[:5]}"
shutil.rmtree(quantized_model_path, ignore_errors=True)
Loading