From 346a339625864a0fc3d634dd669d6b299fd16154 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong" <xiangdong.zeng@intel.com>
Date: Tue, 2 Dec 2025 15:35:32 +0800
Subject: [PATCH 1/5] test_parq: select test device via torch.accelerator

---
 test/prototype/test_parq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/prototype/test_parq.py b/test/prototype/test_parq.py
index 119bfb8d05..8a2dbfdbef 100644
--- a/test/prototype/test_parq.py
+++ b/test/prototype/test_parq.py
@@ -51,7 +51,7 @@
     torch_version_at_least,
 )
 
-_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+_DEVICE = torch.device(torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu")
 
 
 class M(nn.Module):

From fe433decb9ae78e382fe361e300dd9846115509d Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong" <xiangdong.zeng@intel.com>
Date: Tue, 2 Dec 2025 15:52:27 +0800
Subject: [PATCH 2/5] test_quantized_training: use current accelerator instead of hard-coded CUDA

---
 test/prototype/test_quantized_training.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index ab25a38bb3..3a966f4c21 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -34,11 +34,13 @@
     quantize_int8_rowwise,
 )
 from torchao.quantization.quant_api import quantize_
+from torchao.utils import get_current_accelerator_device
 
 if common_utils.SEED is None:
     common_utils.SEED = 1234
 
-_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
+_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) + (["xpu"] if torch.xpu.is_available() else [])
+_DEVICE = get_current_accelerator_device()
 
 
 def _reset():
@@ -182,12 +184,12 @@ def test_int8_weight_only_training(self, compile, device):
         ],
     )
     @parametrize("module_swap", [False, True])
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
     def test_int8_mixed_precision_training(self, compile, config, module_swap):
         _reset()
         bsize = 64
         embed_dim = 64
-        device = "cuda"
+        device = _DEVICE
 
         linear = nn.Linear(embed_dim, embed_dim, device=device)
         linear_int8mp = copy.deepcopy(linear)
@@ -221,7 +223,7 @@ def snr(ref, actual):
 
     @pytest.mark.skip("Flaky on CI")
     @parametrize("compile", [False, True])
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
     def test_bitnet_training(self, compile):
         # reference implementation
         # https://github.com/microsoft/unilm/blob/master/bitnet/The-Era-of-1-bit-LLMs__Training_Tips_Code_FAQ.pdf
@@ -246,7 +248,7 @@ def forward(self, x):
         _reset()
         bsize = 4
         embed_dim = 32
-        device = "cuda"
+        device = _DEVICE
 
         # only use 1 matmul shape to reduce triton autotune time
         model_ref = nn.Sequential(
@@ -296,7 +298,7 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()
 
@@ -342,7 +344,7 @@ def _run_subtest(self, args):
             dropout_p=0,
         )
         torch.manual_seed(42)
-        base_model = Transformer(model_args).cuda()
+        base_model = Transformer(model_args).to(_DEVICE)
         fsdp_model = copy.deepcopy(base_model)
 
         quantize_(base_model.layers, quantize_fn)
@@ -362,7 +364,7 @@ def _run_subtest(self, args):
 
         torch.manual_seed(42 + self.rank + 1)
         for iter_idx in range(5):
-            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
+            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device=_DEVICE)
             fsdp_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
             fsdp_loss = fsdp_model(inp).sum()
             fsdp_loss.backward()
@@ -387,14 +389,14 @@ def _run_subtest(self, args):
             )
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
     def test_precompute_bitnet_scale(self):
         from torchao.prototype.quantized_training.bitnet import (
             get_bitnet_scale,
             precompute_bitnet_scale_for_fsdp,
         )
 
-        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).cuda()
+        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to(_DEVICE)
         model_fsdp = copy.deepcopy(model)
         quantize_(model_fsdp, bitnet_training())
         fully_shard(model_fsdp)

From 075315d3723162d810fa5b796f9f51d240a9cad7 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong" <xiangdong.zeng@intel.com>
Date: Tue, 2 Dec 2025 15:59:51 +0800
Subject: [PATCH 3/5] reformat prototype tests; keep test_fsdp2_correctness CUDA-only

---
 test/prototype/test_parq.py               |  6 +++++-
 test/prototype/test_quantized_training.py | 24 +++++++++++++++++------
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/test/prototype/test_parq.py b/test/prototype/test_parq.py
index 8a2dbfdbef..835bd742de 100644
--- a/test/prototype/test_parq.py
+++ b/test/prototype/test_parq.py
@@ -51,7 +51,11 @@
     torch_version_at_least,
 )
 
-_DEVICE = torch.device(torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu")
+_DEVICE = torch.device(
+    torch.accelerator.current_accelerator().type
+    if torch.accelerator.is_available()
+    else "cpu"
+)
 
 
 class M(nn.Module):
diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index 3a966f4c21..bfe0a9457f 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -39,7 +39,11 @@
 if common_utils.SEED is None:
     common_utils.SEED = 1234
 
-_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) + (["xpu"] if torch.xpu.is_available() else [])
+_DEVICES = (
+    ["cpu"]
+    + (["cuda"] if torch.cuda.is_available() else [])
+    + (["xpu"] if torch.xpu.is_available() else [])
+)
 _DEVICE = get_current_accelerator_device()
 
 
@@ -184,7 +188,9 @@ def test_int8_weight_only_training(self, compile, device):
         ],
     )
     @parametrize("module_swap", [False, True])
-    @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
+    @pytest.mark.skipif(
+        not torch.accelerator.is_available(), reason="GPU not available"
+    )
     def test_int8_mixed_precision_training(self, compile, config, module_swap):
         _reset()
         bsize = 64
@@ -223,7 +229,9 @@ def snr(ref, actual):
 
     @pytest.mark.skip("Flaky on CI")
     @parametrize("compile", [False, True])
-    @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
+    @pytest.mark.skipif(
+        not torch.accelerator.is_available(), reason="GPU not available"
+    )
     def test_bitnet_training(self, compile):
         # reference implementation
         # https://github.com/microsoft/unilm/blob/master/bitnet/The-Era-of-1-bit-LLMs__Training_Tips_Code_FAQ.pdf
@@ -298,7 +306,7 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
-    @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()
 
@@ -389,14 +397,18 @@ def _run_subtest(self, args):
             )
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
-    @pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
+    @pytest.mark.skipif(
+        not torch.accelerator.is_available(), reason="GPU not available"
+    )
     def test_precompute_bitnet_scale(self):
         from torchao.prototype.quantized_training.bitnet import (
             get_bitnet_scale,
             precompute_bitnet_scale_for_fsdp,
         )
 
-        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to(_DEVICE)
+        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to(
+            _DEVICE
+        )
         model_fsdp = copy.deepcopy(model)
         quantize_(model_fsdp, bitnet_training())
         fully_shard(model_fsdp)

From 64e431cc847d59b96638138255240f80e3600e75 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong" <xiangdong.zeng@intel.com>
Date: Thu, 4 Dec 2025 10:02:46 +0800
Subject: [PATCH 4/5] test_parq: pick cuda/xpu/cpu explicitly (fix macos-14)

---
 test/prototype/test_parq.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/prototype/test_parq.py b/test/prototype/test_parq.py
index 835bd742de..9e802ee92c 100644
--- a/test/prototype/test_parq.py
+++ b/test/prototype/test_parq.py
@@ -51,11 +51,12 @@
     torch_version_at_least,
 )
 
-_DEVICE = torch.device(
-    torch.accelerator.current_accelerator().type
-    if torch.accelerator.is_available()
-    else "cpu"
-)
+if torch.cuda.is_available():
+    _DEVICE = "cuda"
+elif torch.xpu.is_available():
+    _DEVICE = "xpu"
+else:
+    _DEVICE = "cpu"
 
 
 class M(nn.Module):

From 2ddc347c5aae3e32bd87d545572d7a52890dcc61 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong" <xiangdong.zeng@intel.com>
Date: Thu, 4 Dec 2025 16:12:13 +0800
Subject: [PATCH 5/5] test_quantized_training: derive _DEVICES from _DEVICE

---
 test/prototype/test_quantized_training.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index bfe0a9457f..d3b14f0f94 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -39,12 +39,8 @@
 if common_utils.SEED is None:
     common_utils.SEED = 1234
 
-_DEVICES = (
-    ["cpu"]
-    + (["cuda"] if torch.cuda.is_available() else [])
-    + (["xpu"] if torch.xpu.is_available() else [])
-)
 _DEVICE = get_current_accelerator_device()
+_DEVICES = ["cpu"] + ([_DEVICE] if torch.accelerator.is_available() else [])
 
 
 def _reset():