From 346a339625864a0fc3d634dd669d6b299fd16154 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 2 Dec 2025 15:35:32 +0800 Subject: [PATCH 1/5] add test/prototype/test_parq.py --- test/prototype/test_parq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/prototype/test_parq.py b/test/prototype/test_parq.py index 119bfb8d05..8a2dbfdbef 100644 --- a/test/prototype/test_parq.py +++ b/test/prototype/test_parq.py @@ -51,7 +51,7 @@ torch_version_at_least, ) -_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") +_DEVICE = torch.device(torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu") class M(nn.Module): From fe433decb9ae78e382fe361e300dd9846115509d Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 2 Dec 2025 15:52:27 +0800 Subject: [PATCH 2/5] add test/prototype/test_quantized_training.py --- test/prototype/test_quantized_training.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py index ab25a38bb3..3a966f4c21 100644 --- a/test/prototype/test_quantized_training.py +++ b/test/prototype/test_quantized_training.py @@ -34,11 +34,13 @@ quantize_int8_rowwise, ) from torchao.quantization.quant_api import quantize_ +from torchao.utils import get_current_accelerator_device if common_utils.SEED is None: common_utils.SEED = 1234 -_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) +_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) + (["xpu"] if torch.xpu.is_available() else []) +_DEVICE = get_current_accelerator_device() def _reset(): @@ -182,12 +184,12 @@ def test_int8_weight_only_training(self, compile, device): ], ) @parametrize("module_swap", [False, True]) - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available") def test_int8_mixed_precision_training(self, compile, config, module_swap): _reset() bsize = 64 embed_dim = 64 - device = "cuda" + device = _DEVICE linear = nn.Linear(embed_dim, embed_dim, device=device) linear_int8mp = copy.deepcopy(linear) @@ -221,7 +223,7 @@ def snr(ref, actual): @pytest.mark.skip("Flaky on CI") @parametrize("compile", [False, True]) - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available") def test_bitnet_training(self, compile): # reference implementation # https://github.com/microsoft/unilm/blob/master/bitnet/The-Era-of-1-bit-LLMs__Training_Tips_Code_FAQ.pdf @@ -246,7 +248,7 @@ def forward(self, x): _reset() bsize = 4 embed_dim = 32 - device = "cuda" + device = _DEVICE # only use 1 matmul shape to reduce triton autotune time model_ref = nn.Sequential( @@ -296,7 +298,7 @@ def world_size(self) -> int: return _FSDP_WORLD_SIZE @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE) - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available") def test_fsdp2_correctness(self): mp_policy = MixedPrecisionPolicy() @@ -342,7 +344,7 @@ def _run_subtest(self, args): dropout_p=0, ) torch.manual_seed(42) - base_model = Transformer(model_args).cuda() + base_model = Transformer(model_args).to(_DEVICE) fsdp_model = copy.deepcopy(base_model) quantize_(base_model.layers, quantize_fn) @@ -362,7 +364,7 @@ def _run_subtest(self, args): torch.manual_seed(42 + self.rank + 1) for iter_idx in range(5): - inp = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") + inp = torch.randint(0, vocab_size, (batch_size, seq_len), device=_DEVICE) fsdp_optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) fsdp_loss = fsdp_model(inp).sum() fsdp_loss.backward() @@ -387,14 +389,14 @@ def _run_subtest(self, args): ) @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE) - @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available") def test_precompute_bitnet_scale(self): from torchao.prototype.quantized_training.bitnet import ( get_bitnet_scale, precompute_bitnet_scale_for_fsdp, ) - model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).cuda() + model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to(_DEVICE) model_fsdp = copy.deepcopy(model) quantize_(model_fsdp, bitnet_training()) fully_shard(model_fsdp) From 075315d3723162d810fa5b796f9f51d240a9cad7 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 2 Dec 2025 15:59:51 +0800 Subject: [PATCH 3/5] add test/prototype/test_quantized_training.py --- test/prototype/test_parq.py | 6 +++++- test/prototype/test_quantized_training.py | 24 +++++++++++++++++------ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/test/prototype/test_parq.py b/test/prototype/test_parq.py index 8a2dbfdbef..835bd742de 100644 --- a/test/prototype/test_parq.py +++ b/test/prototype/test_parq.py @@ -51,7 +51,11 @@ torch_version_at_least, ) -_DEVICE = torch.device(torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu") +_DEVICE = torch.device( + torch.accelerator.current_accelerator().type + if torch.accelerator.is_available() + else "cpu" +) class M(nn.Module): diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py index 3a966f4c21..bfe0a9457f 100644 --- a/test/prototype/test_quantized_training.py +++ b/test/prototype/test_quantized_training.py @@ -39,7 +39,11 @@ if common_utils.SEED is None: common_utils.SEED = 1234 -_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) + (["xpu"] if torch.xpu.is_available() else []) +_DEVICES = ( + ["cpu"] + + (["cuda"] if torch.cuda.is_available() else []) + + (["xpu"] if torch.xpu.is_available() else []) +) _DEVICE = get_current_accelerator_device() @@ -184,7 +188,9 @@ def test_int8_weight_only_training(self, compile, device): ], ) @parametrize("module_swap", [False, True]) - @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available") + @pytest.mark.skipif( + not torch.accelerator.is_available(), reason="GPU not available" + ) def test_int8_mixed_precision_training(self, compile, config, module_swap): _reset() bsize = 64 @@ -223,7 +229,9 @@ def snr(ref, actual): @pytest.mark.skip("Flaky on CI") @parametrize("compile", [False, True]) - @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available") + @pytest.mark.skipif( + not torch.accelerator.is_available(), reason="GPU not available" + ) def test_bitnet_training(self, compile): # reference implementation # https://github.com/microsoft/unilm/blob/master/bitnet/The-Era-of-1-bit-LLMs__Training_Tips_Code_FAQ.pdf @@ -298,7 +306,7 @@ def world_size(self) -> int: return _FSDP_WORLD_SIZE @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE) - @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_fsdp2_correctness(self): mp_policy = MixedPrecisionPolicy() @@ -389,14 +397,18 @@ def _run_subtest(self, args): ) @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE) - @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available") + @pytest.mark.skipif( + not torch.accelerator.is_available(), reason="GPU not available" + ) def test_precompute_bitnet_scale(self): from torchao.prototype.quantized_training.bitnet import ( get_bitnet_scale, precompute_bitnet_scale_for_fsdp, ) - model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to(_DEVICE) + model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to( + _DEVICE + ) model_fsdp = copy.deepcopy(model) quantize_(model_fsdp, bitnet_training()) fully_shard(model_fsdp) From 64e431cc847d59b96638138255240f80e3600e75 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 4 Dec 2025 10:02:46 +0800 Subject: [PATCH 4/5] fix macos-14 --- test/prototype/test_parq.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test/prototype/test_parq.py b/test/prototype/test_parq.py index 835bd742de..9e802ee92c 100644 --- a/test/prototype/test_parq.py +++ b/test/prototype/test_parq.py @@ -51,11 +51,12 @@ torch_version_at_least, ) -_DEVICE = torch.device( - torch.accelerator.current_accelerator().type - if torch.accelerator.is_available() - else "cpu" -) +if torch.cuda.is_available(): + _DEVICE = "cuda" +elif torch.xpu.is_available(): + _DEVICE = "xpu" +else: + _DEVICE = "cpu" class M(nn.Module): From 2ddc347c5aae3e32bd87d545572d7a52890dcc61 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 4 Dec 2025 16:12:13 +0800 Subject: [PATCH 5/5] refine _DEVICES --- test/prototype/test_quantized_training.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py index bfe0a9457f..d3b14f0f94 100644 --- a/test/prototype/test_quantized_training.py +++ b/test/prototype/test_quantized_training.py @@ -39,12 +39,8 @@ if common_utils.SEED is None: common_utils.SEED = 1234 -_DEVICES = ( - ["cpu"] - + (["cuda"] if torch.cuda.is_available() else []) - + (["xpu"] if torch.xpu.is_available() else []) -) _DEVICE = get_current_accelerator_device() +_DEVICES = ["cpu"] + ([_DEVICE] if torch.accelerator.is_available() else []) def _reset():