From 8106bb8383ba9c635078c91983c27e554f246d6d Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Thu, 30 Dec 2021 10:35:27 -0800 Subject: [PATCH 01/40] Fast model checkpointing --- .../torch_serialize_save_model.py | 128 ++++++++++++++++++ .../torch_serialize_save_tensor.py | 113 ++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 fast_io/model_checkpoint/torch_serialize_save_model.py create mode 100644 fast_io/model_checkpoint/torch_serialize_save_tensor.py diff --git a/fast_io/model_checkpoint/torch_serialize_save_model.py b/fast_io/model_checkpoint/torch_serialize_save_model.py new file mode 100644 index 000000000..03d924b1b --- /dev/null +++ b/fast_io/model_checkpoint/torch_serialize_save_model.py @@ -0,0 +1,128 @@ +import time +import argparse +import torch +import os +from transformers import AutoModelForCausalLM +from transformers import T5ForConditionalGeneration +import deepspeed +from deepspeed.ops.aio import AsyncIOBuilder + + +AIO_QUEUE_DEPTH = 8 +AIO_BLOCK_SIZE = 8*(1024**2) +AIO_THREAD_COUNT = 1 +AIO_SINGLE_SUBMIT = False +AIO_OVERLAP_EVENTS = False +PINNED_BUFFER_MB = 64 + +def _get_model(big_model): + if big_model: + model_name="EleutherAI/gpt-j-6B" + model = AutoModelForCausalLM.from_pretrained(model_name).half()#.cuda() + ckpt_name="gpt-j-6B" + else: + model_name="hf-internal-testing/tiny-random-t5" # "patrickvonplaten/t5-tiny-random" # "t5-small" + model = T5ForConditionalGeneration.from_pretrained(model_name).half() + ckpt_name="t5-small" + + return model, model_name, ckpt_name + + +def _get_aio_handle(): + h = AsyncIOBuilder().load().aio_handle( + block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + num_threads=AIO_THREAD_COUNT) + return h + +def test_save(file, buffer, use_zipfile): + st = time.time() + torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + return time.time() - st + + +def test_ds_mock_save(file, buffer, 
use_zipfile): + from deepspeed.io import MockFileWriter + st = time.time() + dsmw = MockFileWriter(file) + torch.save(f=dsmw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + write_sec = time.time() - st + dsmw._dump_state() + return write_sec + +def test_ds_py_save(file, buffer, use_zipfile): + from deepspeed.io import PyFileWriter + st = time.time() + dspw = PyFileWriter(file) + torch.save(f=dspw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + write_sec = time.time() - st + dspw._dump_state() + return write_sec + +def test_ds_aio_save(file, buffer, use_zipfile): + h = _get_aio_handle() + pinned_memory = torch.zeros(PINNED_BUFFER_MB*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + from deepspeed.io import DeepSpeedFileWriter as dsfw + st = time.time() + dsfw = dsfw( + file_path=file, + aio_handle=h, + pinned_tensor=pinned_memory) + torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=True) + write_sec = time.time() - st + dsfw._dump_state() + return write_sec + +def run(model, model_name, ckpt_name, folder): + print(f'Model name = {model_name}') + fn_dict = { + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_aio_save':test_ds_aio_save + } + for tag, fn in fn_dict.items(): + file = os.path.join(folder, f'{tag}_{ckpt_name}.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = fn(file, model, True) + ckpt_size = os.path.getsize(file) + gb_size = ckpt_size/(1024**3) + gb_per_sec = gb_size/write_sec + print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') + print(f'*********************************************') + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + parser.add_argument('--big_model', + action='store_true', + help='Use 
EleutherAI/gpt-j-6B for checkpointing.') + args = parser.parse_args() + print(f'args = {args}') + return args + + +def main(): + print(f'Performance test of deepspeed fast model checkpoint') + print(f'torch version = {torch.__version__}') + torch.manual_seed(42) + args = parse_arguments() + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + quit() + model, model_name, ckpt_name = _get_model(args.big_model) + + run(model, model_name, ckpt_name, args.folder) + + +if __name__ == "__main__": + main() diff --git a/fast_io/model_checkpoint/torch_serialize_save_tensor.py b/fast_io/model_checkpoint/torch_serialize_save_tensor.py new file mode 100644 index 000000000..85f99eddf --- /dev/null +++ b/fast_io/model_checkpoint/torch_serialize_save_tensor.py @@ -0,0 +1,113 @@ +import time +import argparse +import torch +import os +import deepspeed +from deepspeed.ops.aio import AsyncIOBuilder + + +AIO_QUEUE_DEPTH = 8 +AIO_BLOCK_SIZE = 8*(1024**2) +AIO_THREAD_COUNT = 1 +AIO_SINGLE_SUBMIT = False +AIO_OVERLAP_EVENTS = False +PINNED_BUFFER_MB = 64 + + +def _get_aio_handle(): + h = AsyncIOBuilder().load().aio_handle( + block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + num_threads=AIO_THREAD_COUNT) + return h + +def test_save(file, buffer, use_zipfile): + st = time.time() + torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + return time.time() - st + + +def test_ds_mock_save(file, buffer, use_zipfile): + from deepspeed.io import MockFileWriter + st = time.time() + dsmw = MockFileWriter(file) + torch.save(f=dsmw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + write_sec = time.time() - st + dsmw._dump_state() + return write_sec + +def test_ds_py_save(file, buffer, use_zipfile): + from deepspeed.io import PyFileWriter + st = time.time() + dspw = PyFileWriter(file) + torch.save(f=dspw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + 
write_sec = time.time() - st + dspw._dump_state() + return write_sec + +def test_ds_aio_save(file, buffer, use_zipfile): + h = _get_aio_handle() + pinned_memory = torch.zeros(PINNED_BUFFER_MB*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + from deepspeed.io import DeepSpeedFileWriter as dsfw + st = time.time() + dsfw = dsfw( + file_path=file, + aio_handle=h, + pinned_tensor=pinned_memory) + torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=True) + write_sec = time.time() - st + dsfw._dump_state() + return write_sec + +def run(mb_size, folder): + buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu').pin_memory() + + fn_dict = { + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_aio_save':test_ds_aio_save + } + for tag, fn in fn_dict.items(): + file = os.path.join(folder, f'{tag}_{mb_size}MB.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = fn(file, buffer, True) + gb_per_sec = mb_size/(1024.0*write_sec) + gb_size = os.path.getsize(file)/(1024**3) + print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') + print(f'*********************************************') + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + parser.add_argument('--mb_size', + type=int, + default=None, + required=True, + help='Size of tensor to save in MB.') + args = parser.parse_args() + print(f'args = {args}') + return args + + + +def main(): + print(f'Performance test of deepspeed fast checkpoint') + args = parse_arguments() + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + quit() + run(args.mb_size, args.folder) + + +if __name__ == "__main__": + main() From 761e4e54d6bb317d0068e66347d70bffaa7f01cf Mon Sep 17 00:00:00 
2001 From: Tunji Ruwase Date: Fri, 31 Dec 2021 10:42:54 -0800 Subject: [PATCH 02/40] Support both legacy and serialized formats --- ...h_serialize_save_model.py => torch_save_model.py} | 12 ++++++++---- ...serialize_save_tensor.py => torch_save_tensor.py} | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) rename fast_io/model_checkpoint/{torch_serialize_save_model.py => torch_save_model.py} (92%) rename fast_io/model_checkpoint/{torch_serialize_save_tensor.py => torch_save_tensor.py} (92%) diff --git a/fast_io/model_checkpoint/torch_serialize_save_model.py b/fast_io/model_checkpoint/torch_save_model.py similarity index 92% rename from fast_io/model_checkpoint/torch_serialize_save_model.py rename to fast_io/model_checkpoint/torch_save_model.py index 03d924b1b..0b0a5d4d2 100644 --- a/fast_io/model_checkpoint/torch_serialize_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -70,12 +70,12 @@ def test_ds_aio_save(file, buffer, use_zipfile): file_path=file, aio_handle=h, pinned_tensor=pinned_memory) - torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=True) + torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) write_sec = time.time() - st dsfw._dump_state() return write_sec -def run(model, model_name, ckpt_name, folder): +def run(model, model_name, ckpt_name, folder, legacy_save): print(f'Model name = {model_name}') fn_dict = { 'test_save': test_save, @@ -89,7 +89,7 @@ def run(model, model_name, ckpt_name, folder): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, model, True) + write_sec = fn(file, model, not legacy_save) ckpt_size = os.path.getsize(file) gb_size = ckpt_size/(1024**3) gb_per_sec = gb_size/write_sec @@ -106,6 +106,10 @@ def parse_arguments(): parser.add_argument('--big_model', action='store_true', help='Use EleutherAI/gpt-j-6B for checkpointing.') + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + args = 
parser.parse_args() print(f'args = {args}') return args @@ -121,7 +125,7 @@ def main(): quit() model, model_name, ckpt_name = _get_model(args.big_model) - run(model, model_name, ckpt_name, args.folder) + run(model, model_name, ckpt_name, args.folder, args.legacy) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_serialize_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py similarity index 92% rename from fast_io/model_checkpoint/torch_serialize_save_tensor.py rename to fast_io/model_checkpoint/torch_save_tensor.py index 85f99eddf..cc601bb3e 100644 --- a/fast_io/model_checkpoint/torch_serialize_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -56,12 +56,12 @@ def test_ds_aio_save(file, buffer, use_zipfile): file_path=file, aio_handle=h, pinned_tensor=pinned_memory) - torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=True) + torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) write_sec = time.time() - st dsfw._dump_state() return write_sec -def run(mb_size, folder): +def run(mb_size, folder, legacy_save): buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu').pin_memory() fn_dict = { @@ -76,7 +76,7 @@ def run(mb_size, folder): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, buffer, True) + write_sec = fn(file, buffer, not legacy_save) gb_per_sec = mb_size/(1024.0*write_sec) gb_size = os.path.getsize(file)/(1024**3) print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') @@ -94,6 +94,10 @@ def parse_arguments(): default=None, required=True, help='Size of tensor to save in MB.') + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + args = parser.parse_args() print(f'args = {args}') return args @@ -106,7 +110,7 @@ def main(): if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') quit() - run(args.mb_size, args.folder) + 
run(args.mb_size, args.folder, args.legacy) if __name__ == "__main__": From 5967c7986e9e9ccb05e75d3fdb53110137755933 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 3 Jan 2022 12:19:59 +0000 Subject: [PATCH 03/40] Add io_buffer_mb option --- fast_io/model_checkpoint/torch_save_model.py | 22 +++++++++++------- fast_io/model_checkpoint/torch_save_tensor.py | 23 +++++++++++-------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index 0b0a5d4d2..150a0139c 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -37,13 +37,13 @@ def _get_aio_handle(): num_threads=AIO_THREAD_COUNT) return h -def test_save(file, buffer, use_zipfile): +def test_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) return time.time() - st -def test_ds_mock_save(file, buffer, use_zipfile): +def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): from deepspeed.io import MockFileWriter st = time.time() dsmw = MockFileWriter(file) @@ -52,7 +52,7 @@ def test_ds_mock_save(file, buffer, use_zipfile): dsmw._dump_state() return write_sec -def test_ds_py_save(file, buffer, use_zipfile): +def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): from deepspeed.io import PyFileWriter st = time.time() dspw = PyFileWriter(file) @@ -61,9 +61,9 @@ def test_ds_py_save(file, buffer, use_zipfile): dspw._dump_state() return write_sec -def test_ds_aio_save(file, buffer, use_zipfile): +def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): h = _get_aio_handle() - pinned_memory = torch.zeros(PINNED_BUFFER_MB*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() from deepspeed.io import DeepSpeedFileWriter as dsfw st = time.time() dsfw = 
dsfw( @@ -75,7 +75,7 @@ def test_ds_aio_save(file, buffer, use_zipfile): dsfw._dump_state() return write_sec -def run(model, model_name, ckpt_name, folder, legacy_save): +def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): print(f'Model name = {model_name}') fn_dict = { 'test_save': test_save, @@ -89,7 +89,7 @@ def run(model, model_name, ckpt_name, folder, legacy_save): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, model, not legacy_save) + write_sec = fn(file, model, not legacy_save, io_buffer_mb) ckpt_size = os.path.getsize(file) gb_size = ckpt_size/(1024**3) gb_per_sec = gb_size/write_sec @@ -109,6 +109,12 @@ def parse_arguments(): parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') + + parser.add_argument('--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + required=True, + help='Size of pinned i/o buffer in MB.') args = parser.parse_args() print(f'args = {args}') @@ -125,7 +131,7 @@ def main(): quit() model, model_name, ckpt_name = _get_model(args.big_model) - run(model, model_name, ckpt_name, args.folder, args.legacy) + run(model, model_name, ckpt_name, args.folder, args.legacy, args.io_buffer_mb) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index cc601bb3e..80ac99197 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -13,7 +13,6 @@ AIO_OVERLAP_EVENTS = False PINNED_BUFFER_MB = 64 - def _get_aio_handle(): h = AsyncIOBuilder().load().aio_handle( block_size=AIO_BLOCK_SIZE, @@ -23,7 +22,7 @@ def _get_aio_handle(): num_threads=AIO_THREAD_COUNT) return h -def test_save(file, buffer, use_zipfile): +def test_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) return time.time() - st @@ -38,7 +37,7 @@ def test_ds_mock_save(file, 
buffer, use_zipfile): dsmw._dump_state() return write_sec -def test_ds_py_save(file, buffer, use_zipfile): +def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): from deepspeed.io import PyFileWriter st = time.time() dspw = PyFileWriter(file) @@ -47,9 +46,9 @@ def test_ds_py_save(file, buffer, use_zipfile): dspw._dump_state() return write_sec -def test_ds_aio_save(file, buffer, use_zipfile): +def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): h = _get_aio_handle() - pinned_memory = torch.zeros(PINNED_BUFFER_MB*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() from deepspeed.io import DeepSpeedFileWriter as dsfw st = time.time() dsfw = dsfw( @@ -61,8 +60,8 @@ def test_ds_aio_save(file, buffer, use_zipfile): dsfw._dump_state() return write_sec -def run(mb_size, folder, legacy_save): - buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu').pin_memory() +def run(mb_size, folder, legacy_save, io_buffer_mb): + buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') # .pin_memory() fn_dict = { 'test_save': test_save, @@ -76,7 +75,7 @@ def run(mb_size, folder, legacy_save): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, buffer, not legacy_save) + write_sec = fn(file, buffer, not legacy_save, io_buffer_mb) gb_per_sec = mb_size/(1024.0*write_sec) gb_size = os.path.getsize(file)/(1024**3) print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') @@ -98,6 +97,12 @@ def parse_arguments(): action='store_true', help='Use torch legacy save format') + parser.add_argument('--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + required=True, + help='Size of pinned i/o buffer in MB.') + args = parser.parse_args() print(f'args = {args}') return args @@ -110,7 +115,7 @@ def main(): if not os.path.exists(args.folder): 
print(f'Invalid folder: {args.folder}') quit() - run(args.mb_size, args.folder, args.legacy) + run(args.mb_size, args.folder, args.legacy, args.io_buffer_mb) if __name__ == "__main__": From d96f1f6136603a492d700b043870f82518f2ed7e Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 3 Jan 2022 12:43:54 +0000 Subject: [PATCH 04/40] Bug fix --- fast_io/model_checkpoint/torch_save_tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 80ac99197..312cf31b8 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -28,7 +28,7 @@ def test_save(file, buffer, use_zipfile, io_buffer_mb): return time.time() - st -def test_ds_mock_save(file, buffer, use_zipfile): +def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): from deepspeed.io import MockFileWriter st = time.time() dsmw = MockFileWriter(file) From bbd96f2fa31157a52777a5c3fc078f60580e5513 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 3 Jan 2022 13:18:34 +0000 Subject: [PATCH 05/40] Force flush --- fast_io/model_checkpoint/torch_save_model.py | 1 + fast_io/model_checkpoint/torch_save_tensor.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index 150a0139c..a14a933a9 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -71,6 +71,7 @@ def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): aio_handle=h, pinned_tensor=pinned_memory) torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + dsfw.close() # Force flush to storage write_sec = time.time() - st dsfw._dump_state() return write_sec diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 312cf31b8..20a9ea75b 
100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -56,12 +56,13 @@ def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): aio_handle=h, pinned_tensor=pinned_memory) torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + dsfw.close() # Force flush to storage write_sec = time.time() - st dsfw._dump_state() return write_sec def run(mb_size, folder, legacy_save, io_buffer_mb): - buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') # .pin_memory() + buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') fn_dict = { 'test_save': test_save, From 3a161270475044d1d0ba5af10e7b4381063288ff Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 4 Jan 2022 23:04:49 +0000 Subject: [PATCH 06/40] More model options; Refactor common codes --- fast_io/model_checkpoint/torch_save_model.py | 170 ++++++++++-------- fast_io/model_checkpoint/torch_save_tensor.py | 65 +------ fast_io/model_checkpoint/torch_save_utils.py | 61 +++++++ 3 files changed, 157 insertions(+), 139 deletions(-) create mode 100644 fast_io/model_checkpoint/torch_save_utils.py diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index a14a933a9..04795fa4d 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -1,80 +1,44 @@ import time import argparse import torch +from torch.optim import Adam import os from transformers import AutoModelForCausalLM from transformers import T5ForConditionalGeneration -import deepspeed -from deepspeed.ops.aio import AsyncIOBuilder - - -AIO_QUEUE_DEPTH = 8 -AIO_BLOCK_SIZE = 8*(1024**2) -AIO_THREAD_COUNT = 1 -AIO_SINGLE_SUBMIT = False -AIO_OVERLAP_EVENTS = False -PINNED_BUFFER_MB = 64 - -def _get_model(big_model): - if big_model: - model_name="EleutherAI/gpt-j-6B" - model = 
AutoModelForCausalLM.from_pretrained(model_name).half()#.cuda() - ckpt_name="gpt-j-6B" - else: - model_name="hf-internal-testing/tiny-random-t5" # "patrickvonplaten/t5-tiny-random" # "t5-small" - model = T5ForConditionalGeneration.from_pretrained(model_name).half() - ckpt_name="t5-small" - +from torch_save_utils import PINNED_BUFFER_MB +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save + + +def _get_gpt_j_6B(tag): + model_name="EleutherAI/gpt-j-6B" + model = AutoModelForCausalLM.from_pretrained(model_name)#.half() + ckpt_name="gpt-j-6B" return model, model_name, ckpt_name +def _get_tiny_t5(tag): + model_name="hf-internal-testing/tiny-random-t5" + model = T5ForConditionalGeneration.from_pretrained(model_name)#.half() + ckpt_name="tiny-random-t5" + return model, model_name, ckpt_name + + +def _get_hf_gpt2(tag): + model_name = tag + model = AutoModelForCausalLM.from_pretrained(tag) + ckpt_name = tag + return model, model_name, ckpt_name + + +HF_MODELS = { + 'tiny-random-t5': _get_tiny_t5, + 'gpt-j-6B': _get_gpt_j_6B, + 'gpt2': _get_hf_gpt2, + 'gpt2-large': _get_hf_gpt2, + 'gpt2-xl': _get_hf_gpt2, +} -def _get_aio_handle(): - h = AsyncIOBuilder().load().aio_handle( - block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - num_threads=AIO_THREAD_COUNT) - return h - -def test_save(file, buffer, use_zipfile, io_buffer_mb): - st = time.time() - torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - return time.time() - st - - -def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): - from deepspeed.io import MockFileWriter - st = time.time() - dsmw = MockFileWriter(file) - torch.save(f=dsmw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - write_sec = time.time() - st - dsmw._dump_state() - return write_sec - -def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): - from deepspeed.io import PyFileWriter - st = 
time.time() - dspw = PyFileWriter(file) - torch.save(f=dspw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - write_sec = time.time() - st - dspw._dump_state() - return write_sec - -def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): - h = _get_aio_handle() - pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() - from deepspeed.io import DeepSpeedFileWriter as dsfw - st = time.time() - dsfw = dsfw( - file_path=file, - aio_handle=h, - pinned_tensor=pinned_memory) - torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - dsfw.close() # Force flush to storage - write_sec = time.time() - st - dsfw._dump_state() - return write_sec +def _get_model(model_tag): + return HF_MODELS[model_tag](model_tag) def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): print(f'Model name = {model_name}') @@ -82,7 +46,7 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): 'test_save': test_save, 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - 'test_ds_aio_save':test_ds_aio_save + 'test_ds_fast_save':test_ds_fast_save } for tag, fn in fn_dict.items(): file = os.path.join(folder, f'{tag}_{ckpt_name}.pt') @@ -97,6 +61,20 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') print(f'*********************************************') + +def _get_initialized_optimizer(model, fused_opt): + base_optimizer = Adam(model.parameters()) + import deepspeed + if fused_opt: + from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper + else: + from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper + optimizer = FP16_Wrapper(base_optimizer) + for p in model.parameters(): + p.grad = torch.zeros_like(p) + optimizer.step() + return optimizer + def parse_arguments(): parser = 
argparse.ArgumentParser() parser.add_argument('--folder', @@ -104,17 +82,29 @@ def parse_arguments(): type=str, required=True, help='Folder to use for I/O.') - parser.add_argument('--big_model', - action='store_true', - help='Use EleutherAI/gpt-j-6B for checkpointing.') + + parser.add_argument('--model', + default=None, + type=str, + required=True, + help='Hugging Face transformers tag of model (e.g., gpt2).') + parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') + parser.add_argument('--optimizer', + action='store_true', + help='Include optimizer state in checkpoint.') + + + parser.add_argument('--fused', + action='store_true', + help='Use fused fp16 optimizer.') + parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, - required=True, help='Size of pinned i/o buffer in MB.') args = parser.parse_args() @@ -122,17 +112,41 @@ def parse_arguments(): return args +def validate_arguments(args): + success = True + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + success = False + + if not args.model in HF_MODELS: + print(f'{args.model} is not a supported HF model tag') + success = False + + return success + def main(): print(f'Performance test of deepspeed fast model checkpoint') print(f'torch version = {torch.__version__}') torch.manual_seed(42) + args = parse_arguments() - if not os.path.exists(args.folder): - print(f'Invalid folder: {args.folder}') + if not validate_arguments(args): quit() - model, model_name, ckpt_name = _get_model(args.big_model) - - run(model, model_name, ckpt_name, args.folder, args.legacy, args.io_buffer_mb) + + model, model_name, ckpt_name = _get_model(args.model) + if args.optimizer: + model = model.half().cuda() + optimizer = _get_initialized_optimizer(model, args.fused) + ckpt_state = {'model': model, 'optimizer': optimizer} + else: + ckpt_state = {'model': model} + run( + ckpt_state, + model_name, + ckpt_name, + args.folder, + args.legacy, + 
args.io_buffer_mb) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 20a9ea75b..8edea0612 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -2,64 +2,8 @@ import argparse import torch import os -import deepspeed -from deepspeed.ops.aio import AsyncIOBuilder - - -AIO_QUEUE_DEPTH = 8 -AIO_BLOCK_SIZE = 8*(1024**2) -AIO_THREAD_COUNT = 1 -AIO_SINGLE_SUBMIT = False -AIO_OVERLAP_EVENTS = False -PINNED_BUFFER_MB = 64 - -def _get_aio_handle(): - h = AsyncIOBuilder().load().aio_handle( - block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - num_threads=AIO_THREAD_COUNT) - return h - -def test_save(file, buffer, use_zipfile, io_buffer_mb): - st = time.time() - torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - return time.time() - st - - -def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): - from deepspeed.io import MockFileWriter - st = time.time() - dsmw = MockFileWriter(file) - torch.save(f=dsmw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - write_sec = time.time() - st - dsmw._dump_state() - return write_sec - -def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): - from deepspeed.io import PyFileWriter - st = time.time() - dspw = PyFileWriter(file) - torch.save(f=dspw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - write_sec = time.time() - st - dspw._dump_state() - return write_sec - -def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): - h = _get_aio_handle() - pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() - from deepspeed.io import DeepSpeedFileWriter as dsfw - st = time.time() - dsfw = dsfw( - file_path=file, - aio_handle=h, - pinned_tensor=pinned_memory) - torch.save(f=dsfw, obj=buffer, 
_use_new_zipfile_serialization=use_zipfile) - dsfw.close() # Force flush to storage - write_sec = time.time() - st - dsfw._dump_state() - return write_sec +from torch_save_utils import PINNED_BUFFER_MB +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save def run(mb_size, folder, legacy_save, io_buffer_mb): buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') @@ -68,7 +12,7 @@ def run(mb_size, folder, legacy_save, io_buffer_mb): 'test_save': test_save, 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - 'test_ds_aio_save':test_ds_aio_save + 'test_ds_fast_save':test_ds_fast_save } for tag, fn in fn_dict.items(): file = os.path.join(folder, f'{tag}_{mb_size}MB.pt') @@ -101,7 +45,6 @@ def parse_arguments(): parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, - required=True, help='Size of pinned i/o buffer in MB.') args = parser.parse_args() @@ -111,7 +54,7 @@ def parse_arguments(): def main(): - print(f'Performance test of deepspeed fast checkpoint') + print(f'Performance test of deepspeed fast tensor checkpoint') args = parse_arguments() if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py new file mode 100644 index 000000000..f75499f0c --- /dev/null +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -0,0 +1,61 @@ +import time +import torch +import os +import deepspeed +from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter + + +AIO_QUEUE_DEPTH = 8 +AIO_BLOCK_SIZE = 8*(1024**2) +AIO_THREAD_COUNT = 1 +AIO_SINGLE_SUBMIT = False +AIO_OVERLAP_EVENTS = False +PINNED_BUFFER_MB = 64 + +def _get_aio_handle(): + h = AsyncIOBuilder().load().aio_handle( + block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + 
overlap_events=AIO_SINGLE_SUBMIT, + num_threads=AIO_THREAD_COUNT) + return h + +def test_save(file, buffer, use_zipfile, io_buffer_mb): + st = time.time() + torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + return time.time() - st + + +def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): + st = time.time() + ds_mock_writer = MockFileWriter(file) + torch.save(f=ds_mock_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + ds_mock_writer.close() # Force flush to storage + write_sec = time.time() - st + ds_mock_writer._dump_state() + return write_sec + +def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): + st = time.time() + ds_py_writer = PyFileWriter(file) + torch.save(f=ds_py_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + ds_py_writer.close() # Force flush to storage + write_sec = time.time() - st + ds_py_writer._dump_state() + return write_sec + +def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb): + h = _get_aio_handle() + pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + st = time.time() + ds_fast_writer = FastFileWriter( + file_path=file, + aio_handle=h, + pinned_tensor=pinned_memory) + torch.save(f=ds_fast_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + ds_fast_writer.close() # Force flush to storage + write_sec = time.time() - st + ds_fast_writer._dump_state() + return write_sec \ No newline at end of file From c3df4955ba214deb95713f7e3f44b0675374a60c Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Tue, 4 Jan 2022 18:27:30 -0800 Subject: [PATCH 07/40] --gpu option --- fast_io/model_checkpoint/torch_save_model.py | 77 ++++++++++--------- fast_io/model_checkpoint/torch_save_tensor.py | 32 +++++--- fast_io/model_checkpoint/torch_save_utils.py | 57 ++++++++------ 3 files changed, 96 insertions(+), 70 deletions(-) diff --git a/fast_io/model_checkpoint/torch_save_model.py 
b/fast_io/model_checkpoint/torch_save_model.py index 04795fa4d..d38f5af67 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -10,15 +10,16 @@ def _get_gpt_j_6B(tag): - model_name="EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name)#.half() - ckpt_name="gpt-j-6B" + model_name = "EleutherAI/gpt-j-6B" + model = AutoModelForCausalLM.from_pretrained(model_name) #.half() + ckpt_name = "gpt-j-6B" return model, model_name, ckpt_name + def _get_tiny_t5(tag): - model_name="hf-internal-testing/tiny-random-t5" - model = T5ForConditionalGeneration.from_pretrained(model_name)#.half() - ckpt_name="tiny-random-t5" + model_name = "hf-internal-testing/tiny-random-t5" + model = T5ForConditionalGeneration.from_pretrained(model_name) #.half() + ckpt_name = "tiny-random-t5" return model, model_name, ckpt_name @@ -26,27 +27,29 @@ def _get_hf_gpt2(tag): model_name = tag model = AutoModelForCausalLM.from_pretrained(tag) ckpt_name = tag - return model, model_name, ckpt_name + return model, model_name, ckpt_name HF_MODELS = { 'tiny-random-t5': _get_tiny_t5, 'gpt-j-6B': _get_gpt_j_6B, 'gpt2': _get_hf_gpt2, - 'gpt2-large': _get_hf_gpt2, - 'gpt2-xl': _get_hf_gpt2, + 'gpt2-large': _get_hf_gpt2, + 'gpt2-xl': _get_hf_gpt2, } + def _get_model(model_tag): return HF_MODELS[model_tag](model_tag) + def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): print(f'Model name = {model_name}') fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - 'test_ds_fast_save':test_ds_fast_save + 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): file = os.path.join(folder, f'{tag}_{ckpt_name}.pt') @@ -56,25 +59,28 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): st = time.time() write_sec = fn(file, model, not legacy_save, 
io_buffer_mb) ckpt_size = os.path.getsize(file) - gb_size = ckpt_size/(1024**3) - gb_per_sec = gb_size/write_sec - print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') + gb_size = ckpt_size / (1024**3) + gb_per_sec = gb_size / write_sec + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + ) print(f'*********************************************') def _get_initialized_optimizer(model, fused_opt): base_optimizer = Adam(model.parameters()) import deepspeed - if fused_opt: + if fused_opt: from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper else: - from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper + from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper optimizer = FP16_Wrapper(base_optimizer) for p in model.parameters(): p.grad = torch.zeros_like(p) optimizer.step() return optimizer + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--folder', @@ -82,26 +88,28 @@ def parse_arguments(): type=str, required=True, help='Folder to use for I/O.') - - parser.add_argument('--model', - default=None, - type=str, - required=True, - help='Hugging Face transformers tag of model (e.g., gpt2).') + + parser.add_argument( + '--model', + default=None, + type=str, + required=True, + help='Hugging Face transformers tag of model (e.g., gpt2).') parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') - + parser.add_argument('--optimizer', action='store_true', help='Include optimizer state in checkpoint.') - parser.add_argument('--fused', action='store_true', help='Use fused fp16 optimizer.') + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, @@ -116,14 +124,15 @@ def validate_arguments(args): success = True if not os.path.exists(args.folder): 
print(f'Invalid folder: {args.folder}') - success = False + success = False if not args.model in HF_MODELS: print(f'{args.model} is not a supported HF model tag') - success = False + success = False return success + def main(): print(f'Performance test of deepspeed fast model checkpoint') print(f'torch version = {torch.__version__}') @@ -135,19 +144,17 @@ def main(): model, model_name, ckpt_name = _get_model(args.model) if args.optimizer: - model = model.half().cuda() + model = model.half().cuda() optimizer = _get_initialized_optimizer(model, args.fused) ckpt_state = {'model': model, 'optimizer': optimizer} else: + model = model.half() + if args.gpu: + model = model.cuda() ckpt_state = {'model': model} - run( - ckpt_state, - model_name, - ckpt_name, - args.folder, - args.legacy, + run(ckpt_state, model_name, ckpt_name, args.folder, args.legacy, args.io_buffer_mb) - + if __name__ == "__main__": main() diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 8edea0612..749c6ea31 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -5,14 +5,18 @@ from torch_save_utils import PINNED_BUFFER_MB from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save -def run(mb_size, folder, legacy_save, io_buffer_mb): - buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') + +def run(mb_size, folder, legacy_save, io_buffer_mb, device): + buffer = torch.randint(high=128, + size=(mb_size * (1024**2), ), + dtype=torch.uint8, + device=device) fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - 'test_ds_fast_save':test_ds_fast_save + 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): file = os.path.join(folder, f'{tag}_{mb_size}MB.pt') @@ -21,11 
+25,14 @@ def run(mb_size, folder, legacy_save, io_buffer_mb): os.remove(file) st = time.time() write_sec = fn(file, buffer, not legacy_save, io_buffer_mb) - gb_per_sec = mb_size/(1024.0*write_sec) - gb_size = os.path.getsize(file)/(1024**3) - print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') + gb_per_sec = mb_size / (1024.0 * write_sec) + gb_size = os.path.getsize(file) / (1024**3) + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + ) print(f'*********************************************') + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--folder', @@ -42,6 +49,8 @@ def parse_arguments(): action='store_true', help='Use torch legacy save format') + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, @@ -52,15 +61,16 @@ def parse_arguments(): return args - def main(): print(f'Performance test of deepspeed fast tensor checkpoint') args = parse_arguments() if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') quit() - run(args.mb_size, args.folder, args.legacy, args.io_buffer_mb) - + + device = torch.cuda.current_device() if args.gpu else 'cpu' + run(args.mb_size, args.folder, args.legacy, args.io_buffer_mb, device) + if __name__ == "__main__": main() diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index f75499f0c..be86e2206 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -5,23 +5,23 @@ from deepspeed.ops.aio import AsyncIOBuilder from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter - AIO_QUEUE_DEPTH = 8 -AIO_BLOCK_SIZE = 8*(1024**2) +AIO_BLOCK_SIZE = 8 * (1024**2) AIO_THREAD_COUNT = 1 AIO_SINGLE_SUBMIT = False AIO_OVERLAP_EVENTS = False -PINNED_BUFFER_MB = 64 +PINNED_BUFFER_MB = 64 + def _get_aio_handle(): - h 
= AsyncIOBuilder().load().aio_handle( - block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - num_threads=AIO_THREAD_COUNT) + h = AsyncIOBuilder().load().aio_handle(block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + num_threads=AIO_THREAD_COUNT) return h + def test_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) @@ -31,31 +31,40 @@ def test_save(file, buffer, use_zipfile, io_buffer_mb): def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() ds_mock_writer = MockFileWriter(file) - torch.save(f=ds_mock_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - ds_mock_writer.close() # Force flush to storage + torch.save(f=ds_mock_writer, + obj=buffer, + _use_new_zipfile_serialization=use_zipfile) + ds_mock_writer.close() # Force flush to storage write_sec = time.time() - st ds_mock_writer._dump_state() - return write_sec + return write_sec + def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() - ds_py_writer = PyFileWriter(file) - torch.save(f=ds_py_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - ds_py_writer.close() # Force flush to storage + ds_py_writer = PyFileWriter(file) + torch.save(f=ds_py_writer, + obj=buffer, + _use_new_zipfile_serialization=use_zipfile) + ds_py_writer.close() # Force flush to storage write_sec = time.time() - st ds_py_writer._dump_state() - return write_sec + return write_sec + def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb): h = _get_aio_handle() - pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + pinned_memory = torch.zeros(io_buffer_mb * (1024**2), + dtype=torch.uint8, + device='cpu').pin_memory() st = time.time() - ds_fast_writer = FastFileWriter( - 
file_path=file, - aio_handle=h, - pinned_tensor=pinned_memory) - torch.save(f=ds_fast_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - ds_fast_writer.close() # Force flush to storage + ds_fast_writer = FastFileWriter(file_path=file, + aio_handle=h, + pinned_tensor=pinned_memory) + torch.save(f=ds_fast_writer, + obj=buffer, + _use_new_zipfile_serialization=use_zipfile) + ds_fast_writer.close() # Force flush to storage write_sec = time.time() - st ds_fast_writer._dump_state() - return write_sec \ No newline at end of file + return write_sec From 315f02ab7bc01f3f92207cf7a29606dfc2f40e15 Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Tue, 4 Jan 2022 19:09:35 -0800 Subject: [PATCH 08/40] --half and more flexible options --- fast_io/model_checkpoint/torch_save_model.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index d38f5af67..b71450074 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -110,6 +110,10 @@ def parse_arguments(): parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + parser.add_argument('--half', + action='store_true', + help='Use half-precision tensors.') + parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, @@ -130,6 +134,11 @@ def validate_arguments(args): print(f'{args.model} is not a supported HF model tag') success = False + if args.optimizer and args.half: + if not args.gpu: + print(f'mixed precision only supported with gpu tensors') + success = False + return success @@ -143,14 +152,14 @@ def main(): quit() model, model_name, ckpt_name = _get_model(args.model) + if args.half: + model = model.half() + if args.gpu: + model = model.cuda() if args.optimizer: - model = model.half().cuda() optimizer = _get_initialized_optimizer(model, args.fused) ckpt_state = {'model': model, 'optimizer': optimizer} 
else: - model = model.half() - if args.gpu: - model = model.cuda() ckpt_state = {'model': model} run(ckpt_state, model_name, ckpt_name, args.folder, args.legacy, args.io_buffer_mb) From a41ba080139427a21ea90f92aebadcb2ed8e910d Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Sat, 8 Jan 2022 10:45:49 -0800 Subject: [PATCH 09/40] Add deepspeed.save_checkpoint() --- .../model_checkpoint/deepspeed_save_model.py | 117 ++++++++++++++++++ fast_io/model_checkpoint/save_model_utils.py | 114 +++++++++++++++++ fast_io/model_checkpoint/torch_save_model.py | 114 ++--------------- fast_io/model_checkpoint/torch_save_tensor.py | 23 ++-- fast_io/model_checkpoint/torch_save_utils.py | 19 +-- 5 files changed, 268 insertions(+), 119 deletions(-) create mode 100644 fast_io/model_checkpoint/deepspeed_save_model.py create mode 100644 fast_io/model_checkpoint/save_model_utils.py diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py new file mode 100644 index 000000000..a9c9db2f6 --- /dev/null +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -0,0 +1,117 @@ +import time +import torch +import os +import shutil +import deepspeed +from save_model_utils import get_model, validate_arguments, parse_arguments + + +def _get_ds_config(args, writer_type): + ds_config = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": args.zero_stage, + "cpu_offload": args.cpu_offload + }, + "fp16": { + "enabled": args.half + }, + "optimizer": { + "type": "Adam", + "params": { + "torch_adam": not args.fused + } + }, + "checkpoint": { + "checkpoint_serialization": not args.legacy + }, + "aio": { + "block_size": 8 * (1024**2), + "queue_depth": 8, + "single_submit": False, + "overlap_events": False, + "thread_count": 1, + } + } + + if writer_type: + ds_config["checkpoint"]["writer"] = { + "type": writer_type, + "io_buffer_size": args.io_buffer_mb * (1024**2), + "show_statistics": not args.no_statistics + } + + return 
ds_config + + +def _get_ds_engine(model, ds_config): + ds_engine, _, _, _ = deepspeed.initialize( + model=model, model_parameters=model.parameters(), config=ds_config) + + return ds_engine + + +def _do_optimizer_step(ds_engine): + for p in ds_engine.module.parameters(): + p.grad = torch.zeros_like(p) + ds_engine.step() + + +def test_save(tag, folder, model, args, writer_type): + ds_config = _get_ds_config(args, writer_type) + ds_engine = _get_ds_engine(model, ds_config) + if args.zero_stage == 0: + _do_optimizer_step(ds_engine) + + st = time.time() + ds_engine.save_checkpoint(save_dir=folder, tag=tag) + write_sec = time.time() - st + return write_sec + + +def _get_folder_size(folder): + size = 0 + for path, _, files in os.walk(folder): + size += sum([os.path.getsize(os.path.join(path, f)) for f in files]) + return size + + +def run(model, model_name, ckpt_name, args): + print(f'Model name = {model_name}') + writer_dict = { + 'test_save': None, + 'test_ds_mock_save': 'mock', + 'test_ds_py_save': 'python', + 'test_ds_fast_save': 'fast' + } + for tag, writer_type in writer_dict.items(): + folder = os.path.join(args.folder, ckpt_name, tag) + if os.path.exists(folder): + shutil.rmtree(folder) + write_sec = test_save(tag, folder, model, args, writer_type) + ckpt_size = _get_folder_size(folder) + gb_size = ckpt_size / (1024**3) + gb_per_sec = gb_size / write_sec + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + ) + print(f'*********************************************') + + +def main(): + print( + f'Performance test of deepspeed integration of fast model checkpointing.' 
+ ) + print(f'torch version = {torch.__version__}') + torch.manual_seed(42) + + args = parse_arguments() + if not validate_arguments(args): + quit() + + model, model_name, ckpt_name = get_model(args.model) + run(model, model_name, ckpt_name, args) + + +if __name__ == "__main__": + main() diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py new file mode 100644 index 000000000..c19062d13 --- /dev/null +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -0,0 +1,114 @@ +import argparse +import os +from transformers import AutoModelForCausalLM +from transformers import T5ForConditionalGeneration +from torch_save_utils import PINNED_BUFFER_MB + + +def _get_gpt_j_6B(tag): + model_name = "EleutherAI/gpt-j-6B" + model = AutoModelForCausalLM.from_pretrained(model_name) + ckpt_name = "gpt-j-6B" + return model, model_name, ckpt_name + + +def _get_tiny_t5(tag): + model_name = "hf-internal-testing/tiny-random-t5" + model = T5ForConditionalGeneration.from_pretrained(model_name) + ckpt_name = "tiny-random-t5" + return model, model_name, ckpt_name + + +def _get_hf_gpt2(tag): + model_name = tag + model = AutoModelForCausalLM.from_pretrained(tag) + ckpt_name = tag + return model, model_name, ckpt_name + + +HF_MODELS = { + 'tiny-t5': _get_tiny_t5, + 'gpt-j-6B': _get_gpt_j_6B, + 'gpt2': _get_hf_gpt2, + 'gpt2-large': _get_hf_gpt2, + 'gpt2-xl': _get_hf_gpt2, +} + + +def get_model(model_tag): + return HF_MODELS[model_tag](model_tag) + + +def validate_arguments(args): + success = True + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + success = False + + if not args.model in HF_MODELS: + print(f'{args.model} is not a supported HF model tag') + success = False + + if args.optimizer and args.half: + if not args.gpu: + print(f'mixed precision only supported with gpu tensors') + success = False + + return success + + +def parse_arguments(): + parser = argparse.ArgumentParser() + 
parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + + parser.add_argument( + '--model', + default=None, + type=str, + required=True, + help='Hugging Face transformers tag of model (e.g., gpt2).') + + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + + parser.add_argument('--optimizer', + action='store_true', + help='Include optimizer state in checkpoint.') + + parser.add_argument('--fused', + action='store_true', + help='Use fused fp16 optimizer.') + + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + + parser.add_argument('--half', + action='store_true', + help='Use half-precision tensors.') + + parser.add_argument( + '--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + help=f'Size of pinned i/o buffer in MB. Default = {PINNED_BUFFER_MB}') + + parser.add_argument('--zero_stage', + type=int, + default=0, + help='ZeRO optimization stage. Default = 0') + + parser.add_argument('--cpu_offload', + action='store_true', + help='Enable CPU offload of optimizer state.') + + parser.add_argument('--no-statistics', + action='store_true', + help='Suppress low-level performance statistics.') + + args = parser.parse_args() + print(f'args = {args}') + return args diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index b71450074..1375f048b 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -1,49 +1,13 @@ import time -import argparse import torch from torch.optim import Adam import os -from transformers import AutoModelForCausalLM -from transformers import T5ForConditionalGeneration -from torch_save_utils import PINNED_BUFFER_MB from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save +from save_model_utils import get_model, validate_arguments, parse_arguments -def _get_gpt_j_6B(tag): - 
model_name = "EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name) #.half() - ckpt_name = "gpt-j-6B" - return model, model_name, ckpt_name - - -def _get_tiny_t5(tag): - model_name = "hf-internal-testing/tiny-random-t5" - model = T5ForConditionalGeneration.from_pretrained(model_name) #.half() - ckpt_name = "tiny-random-t5" - return model, model_name, ckpt_name - - -def _get_hf_gpt2(tag): - model_name = tag - model = AutoModelForCausalLM.from_pretrained(tag) - ckpt_name = tag - return model, model_name, ckpt_name - - -HF_MODELS = { - 'tiny-random-t5': _get_tiny_t5, - 'gpt-j-6B': _get_gpt_j_6B, - 'gpt2': _get_hf_gpt2, - 'gpt2-large': _get_hf_gpt2, - 'gpt2-xl': _get_hf_gpt2, -} - - -def _get_model(model_tag): - return HF_MODELS[model_tag](model_tag) - - -def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): +def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb, + show_statistics): print(f'Model name = {model_name}') fn_dict = { 'test_save': test_save, @@ -57,7 +21,8 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, model, not legacy_save, io_buffer_mb) + write_sec = fn(file, model, not legacy_save, io_buffer_mb, + show_statistics) ckpt_size = os.path.getsize(file) gb_size = ckpt_size / (1024**3) gb_per_sec = gb_size / write_sec @@ -81,69 +46,10 @@ def _get_initialized_optimizer(model, fused_opt): return optimizer -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--folder', - default=None, - type=str, - required=True, - help='Folder to use for I/O.') - - parser.add_argument( - '--model', - default=None, - type=str, - required=True, - help='Hugging Face transformers tag of model (e.g., gpt2).') - - parser.add_argument('--legacy', - action='store_true', - help='Use torch legacy save format') - - parser.add_argument('--optimizer', - action='store_true', - help='Include 
optimizer state in checkpoint.') - - parser.add_argument('--fused', - action='store_true', - help='Use fused fp16 optimizer.') - - parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') - - parser.add_argument('--half', - action='store_true', - help='Use half-precision tensors.') - - parser.add_argument('--io_buffer_mb', - type=int, - default=PINNED_BUFFER_MB, - help='Size of pinned i/o buffer in MB.') - - args = parser.parse_args() - print(f'args = {args}') - return args - - -def validate_arguments(args): - success = True - if not os.path.exists(args.folder): - print(f'Invalid folder: {args.folder}') - success = False - - if not args.model in HF_MODELS: - print(f'{args.model} is not a supported HF model tag') - success = False - - if args.optimizer and args.half: - if not args.gpu: - print(f'mixed precision only supported with gpu tensors') - success = False - - return success - - def main(): - print(f'Performance test of deepspeed fast model checkpoint') + print( + f'Performance test of torch.save() integration of fast model checkpointing.' 
+ ) print(f'torch version = {torch.__version__}') torch.manual_seed(42) @@ -151,7 +57,7 @@ def main(): if not validate_arguments(args): quit() - model, model_name, ckpt_name = _get_model(args.model) + model, model_name, ckpt_name = get_model(args.model) if args.half: model = model.half() if args.gpu: @@ -162,7 +68,7 @@ def main(): else: ckpt_state = {'model': model} run(ckpt_state, model_name, ckpt_name, args.folder, args.legacy, - args.io_buffer_mb) + args.io_buffer_mb, not args.no_statistics) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 749c6ea31..0ecf0e6ef 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -6,9 +6,10 @@ from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save -def run(mb_size, folder, legacy_save, io_buffer_mb, device): +def run(args): + device = torch.cuda.current_device() if args.gpu else 'cpu' buffer = torch.randint(high=128, - size=(mb_size * (1024**2), ), + size=(args.mb_size * (1024**2), ), dtype=torch.uint8, device=device) @@ -19,13 +20,14 @@ def run(mb_size, folder, legacy_save, io_buffer_mb, device): 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): - file = os.path.join(folder, f'{tag}_{mb_size}MB.pt') + file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') print(f'checkpoint file = {file}') if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, buffer, not legacy_save, io_buffer_mb) - gb_per_sec = mb_size / (1024.0 * write_sec) + write_sec = fn(file, buffer, not args.legacy, args.io_buffer_mb, + not args.no_statistics) + gb_per_sec = args.mb_size / (1024.0 * write_sec) gb_size = os.path.getsize(file) / (1024**3) print( f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' @@ -56,20 +58,25 @@ def parse_arguments(): default=PINNED_BUFFER_MB, help='Size of pinned i/o 
buffer in MB.') + parser.add_argument('--no-statistics', + action='store_true', + help='Suppress low-level performance statistics.') + args = parser.parse_args() print(f'args = {args}') return args def main(): - print(f'Performance test of deepspeed fast tensor checkpoint') + print( + f'Performance test of torch.save() integration of fast tensor checkpointing.' + ) args = parse_arguments() if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') quit() - device = torch.cuda.current_device() if args.gpu else 'cpu' - run(args.mb_size, args.folder, args.legacy, args.io_buffer_mb, device) + run(args) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index be86e2206..3455a1a60 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -22,13 +22,14 @@ def _get_aio_handle(): return h -def test_save(file, buffer, use_zipfile, io_buffer_mb): +def test_save(file, buffer, use_zipfile, io_buffer_mb, show_statistics): st = time.time() torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) return time.time() - st -def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): +def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb, + show_statistics): st = time.time() ds_mock_writer = MockFileWriter(file) torch.save(f=ds_mock_writer, @@ -36,11 +37,12 @@ def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): _use_new_zipfile_serialization=use_zipfile) ds_mock_writer.close() # Force flush to storage write_sec = time.time() - st - ds_mock_writer._dump_state() + if show_statistics: + ds_mock_writer._dump_state() return write_sec -def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): +def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb, show_statistics): st = time.time() ds_py_writer = PyFileWriter(file) torch.save(f=ds_py_writer, @@ -48,11 +50,13 @@ def test_ds_py_save(file, 
buffer, use_zipfile, io_buffer_mb): _use_new_zipfile_serialization=use_zipfile) ds_py_writer.close() # Force flush to storage write_sec = time.time() - st - ds_py_writer._dump_state() + if show_statistics: + ds_py_writer._dump_state() return write_sec -def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb): +def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb, + show_statistics): h = _get_aio_handle() pinned_memory = torch.zeros(io_buffer_mb * (1024**2), dtype=torch.uint8, @@ -66,5 +70,6 @@ def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb): _use_new_zipfile_serialization=use_zipfile) ds_fast_writer.close() # Force flush to storage write_sec = time.time() - st - ds_fast_writer._dump_state() + if show_statistics: + ds_fast_writer._dump_state() return write_sec From 4fcb06040b185f08f6273736054cbef08eb8e2a0 Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Sat, 8 Jan 2022 12:46:06 -0800 Subject: [PATCH 10/40] Free ds memory --- fast_io/model_checkpoint/deepspeed_save_model.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index a9c9db2f6..7cad72f36 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -2,6 +2,7 @@ import torch import os import shutil +import gc import deepspeed from save_model_utils import get_model, validate_arguments, parse_arguments @@ -57,6 +58,16 @@ def _do_optimizer_step(ds_engine): ds_engine.step() +def _free_ds_memory(ds_engine): + ds_engine.optimizer.optimizer = None + ds_engine.optimizer = None + ds_engine.module = None + ds_engine = None + del ds_engine + gc.collect() + torch.cuda.empty_cache() + + def test_save(tag, folder, model, args, writer_type): ds_config = _get_ds_config(args, writer_type) ds_engine = _get_ds_engine(model, ds_config) @@ -66,6 +77,7 @@ def test_save(tag, folder, model, args, writer_type): st = time.time() 
ds_engine.save_checkpoint(save_dir=folder, tag=tag) write_sec = time.time() - st + _free_ds_memory(ds_engine) return write_sec From a49c5424977196ee0bc0f55dcfbec5591315294a Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Sat, 8 Jan 2022 13:24:24 -0800 Subject: [PATCH 11/40] Improve repro --- fast_io/model_checkpoint/deepspeed_save_model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 7cad72f36..4cf6b4ae1 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -3,6 +3,8 @@ import os import shutil import gc +import random +import numpy as np import deepspeed from save_model_utils import get_model, validate_arguments, parse_arguments @@ -116,7 +118,8 @@ def main(): ) print(f'torch version = {torch.__version__}') torch.manual_seed(42) - + np.random.seed(0) + random.seed(0) args = parse_arguments() if not validate_arguments(args): quit() From 233b9e92bb7fa63bc6609a0f652418bf4d53ca4e Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 22 Feb 2022 12:13:57 -0800 Subject: [PATCH 12/40] Double I/O buffer (#56) --- .../model_checkpoint/deepspeed_save_model.py | 3 +- fast_io/model_checkpoint/save_model_utils.py | 4 +++ fast_io/model_checkpoint/torch_save_model.py | 11 +++---- fast_io/model_checkpoint/torch_save_tensor.py | 7 +++-- fast_io/model_checkpoint/torch_save_utils.py | 31 ++++++++++--------- 5 files changed, 31 insertions(+), 25 deletions(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 4cf6b4ae1..081fe0299 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -41,7 +41,8 @@ def _get_ds_config(args, writer_type): ds_config["checkpoint"]["writer"] = { "type": writer_type, "io_buffer_size": args.io_buffer_mb * (1024**2), - 
"show_statistics": not args.no_statistics + "io_buffer_double": not args.single_io_buffer, + "show_statistics": not args.no_statistics, } return ds_config diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index c19062d13..24f9f87d1 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -109,6 +109,10 @@ def parse_arguments(): action='store_true', help='Suppress low-level performance statistics.') + parser.add_argument('--single_io_buffer', + action='store_true', + help='Disable double buffering of i/o buffer.') + args = parser.parse_args() print(f'args = {args}') return args diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index 1375f048b..245d49e30 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -6,8 +6,7 @@ from save_model_utils import get_model, validate_arguments, parse_arguments -def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb, - show_statistics): +def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') fn_dict = { 'test_save': test_save, @@ -16,13 +15,12 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb, 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): - file = os.path.join(folder, f'{tag}_{ckpt_name}.pt') + file = os.path.join(args.folder, f'{tag}_{ckpt_name}.pt') print(f'checkpoint file = {file}') if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, model, not legacy_save, io_buffer_mb, - show_statistics) + write_sec = fn(file, model, args) ckpt_size = os.path.getsize(file) gb_size = ckpt_size / (1024**3) gb_per_sec = gb_size / write_sec @@ -67,8 +65,7 @@ def main(): ckpt_state = {'model': model, 'optimizer': optimizer} else: ckpt_state = {'model': model} - run(ckpt_state, model_name, ckpt_name, 
args.folder, args.legacy, - args.io_buffer_mb, not args.no_statistics) + run(ckpt_state, model_name, ckpt_name, args) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 0ecf0e6ef..80d5f1358 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -25,8 +25,7 @@ def run(args): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, buffer, not args.legacy, args.io_buffer_mb, - not args.no_statistics) + write_sec = fn(file, buffer, args) gb_per_sec = args.mb_size / (1024.0 * write_sec) gb_size = os.path.getsize(file) / (1024**3) print( @@ -62,6 +61,10 @@ def parse_arguments(): action='store_true', help='Suppress low-level performance statistics.') + parser.add_argument('--single_io_buffer', + action='store_true', + help='Disable double buffering of i/o buffer.') + args = parser.parse_args() print(f'args = {args}') return args diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index 3455a1a60..c01fd014c 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -22,54 +22,55 @@ def _get_aio_handle(): return h -def test_save(file, buffer, use_zipfile, io_buffer_mb, show_statistics): +def test_save(file, buffer, args): st = time.time() - torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + torch.save(f=file, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) return time.time() - st -def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb, - show_statistics): +def test_ds_mock_save(file, buffer, args): st = time.time() ds_mock_writer = MockFileWriter(file) torch.save(f=ds_mock_writer, obj=buffer, - _use_new_zipfile_serialization=use_zipfile) + _use_new_zipfile_serialization=not args.legacy) ds_mock_writer.close() # Force flush to storage write_sec = 
time.time() - st - if show_statistics: + if not args.no_statistics: ds_mock_writer._dump_state() return write_sec -def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb, show_statistics): +def test_ds_py_save(file, buffer, args): st = time.time() ds_py_writer = PyFileWriter(file) torch.save(f=ds_py_writer, obj=buffer, - _use_new_zipfile_serialization=use_zipfile) + _use_new_zipfile_serialization=not args.legacy) ds_py_writer.close() # Force flush to storage write_sec = time.time() - st - if show_statistics: + if not args.no_statistics: ds_py_writer._dump_state() return write_sec -def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb, - show_statistics): +def test_ds_fast_save(file, buffer, args): h = _get_aio_handle() - pinned_memory = torch.zeros(io_buffer_mb * (1024**2), + pinned_memory = torch.zeros(args.io_buffer_mb * (1024**2), dtype=torch.uint8, device='cpu').pin_memory() st = time.time() ds_fast_writer = FastFileWriter(file_path=file, aio_handle=h, - pinned_tensor=pinned_memory) + pinned_tensor=pinned_memory, + double_buffer=not args.single_io_buffer) torch.save(f=ds_fast_writer, obj=buffer, - _use_new_zipfile_serialization=use_zipfile) + _use_new_zipfile_serialization=not args.legacy) ds_fast_writer.close() # Force flush to storage write_sec = time.time() - st - if show_statistics: + if not args.no_statistics: ds_fast_writer._dump_state() return write_sec From b1f02b21b0643083a982d8f9f2cebc640d6a2f5b Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 11 Mar 2022 13:07:03 -0800 Subject: [PATCH 13/40] Double I/O buffer (#60) From a16ac9eed8f8401cb265edcd639d4d6c3572c083 Mon Sep 17 00:00:00 2001 From: jerryyangli Date: Tue, 15 Mar 2022 06:51:29 -0700 Subject: [PATCH 14/40] Add checkpoint comparison (#62) * Add checkpoint comparison * Corrected a typo Co-authored-by: Yang Li --- .../model_checkpoint/checkpoint_compare.py | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 
fast_io/model_checkpoint/checkpoint_compare.py diff --git a/fast_io/model_checkpoint/checkpoint_compare.py b/fast_io/model_checkpoint/checkpoint_compare.py new file mode 100644 index 000000000..cc67b61d9 --- /dev/null +++ b/fast_io/model_checkpoint/checkpoint_compare.py @@ -0,0 +1,123 @@ +#This script is for testing whether two checkpoints match; it prints all the differences + +import torch +import os +import sys +import pickle +from collections import OrderedDict + +exclude_key_str = {'ds_config/checkpoint/writer'} + +def main(): + dir1 = sys.argv[1] + dir2 = sys.argv[2] + print ("Begin comparison") + print ("The first directory {}" .format(dir1)) + print ("The second directory {}" .format(dir2)) + print (' ') + + file_list1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))] + file_list2 = [f for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))] + common_files = [] + + for f in file_list1: + if not (f in file_list2): + log_error_file_mismatch_first(f) + else: + common_files.append(f) + for f in file_list2: + if not (f in file_list1): + log_error_file_mismatch_second(f) + + for f in common_files: + full_dir1 = os.path.join(dir1, f) + full_dir2 = os.path.join(dir2, f) + print ("Begin comparison") + print("The first checkpoint {}" .format(full_dir1)) + print("The second checkpoint {}" .format(full_dir2)) + print(' ') + model_first = torch.load(full_dir1) + model_second = torch.load(full_dir2) + object_compare(model_first, model_second, []) + + +def object_compare(model_first, model_second, key_chain): + if not (type(model_first) == type(model_second)): + log_error_value_mismatch(model_first, model_second, key_chain) + return + + if type(model_first) is list: + if len(model_first) != len(model_second): + log_error_value_mismatch(model_first, model_second, key_chain) + return + for i in range(len(model_first)): + object_compare(model_first[i], model_second[i], key_chain) + return + + if type(model_first) is dict or type(model_first) 
is OrderedDict: + common_keys = [] + for key in model_first: + if key not in model_second: + key_chain.append(key) + log_error_key_mismatch_first(model_first[key], key_chain) + key_chain.pop() + else: + common_keys.append(key) + + for key in model_second: + if key not in model_first: + key_chain.append(key) + log_error_key_mismatch_second(model_second[key], key_chain) + key_chain.pop() + + for key in common_keys: + key_chain.append(key) + object_compare(model_first[key], model_second[key], key_chain) + key_chain.pop() + return + + if hasattr(model_first, '__dict__'): + equality = (model_first.__dict__ == model_second.__dict__) + else: + equality = (model_first == model_second) + if type(equality) is not bool: + equality = (equality.all()) + if not equality: + log_error_value_mismatch(model_first, model_second, key_chain) + return + + +def log_error_file_mismatch_first(filename): + print("The following file appeared in the first but not the second directory: {}" .format(filename)) + print(' ') + + +def log_error_file_mismatch_second(filename): + print("The following file appeared in the second but not the first directory: {}" .format(filename)) + print(" ") + + +def log_error_key_mismatch_first(model, key_chain): + key_str = "/".join(key_chain) + if not (key_str in exclude_key_str): + print("The following key appeared in the first but not the second model: {}" .format(key_str)) + print("The value of the first model is: {}" .format(model)) + print(" ") + + +def log_error_key_mismatch_second(model, key_chain): + key_str = "/".join(key_chain) + if not (key_str in exclude_key_str): + print("The following key appeared in the second but not the first model: {}" .format(key_str)) + print("The value of the second model is: {}" .format(model)) + print(" ") + + +def log_error_value_mismatch(model_first, model_second, key_chain): + print ("The values of the following key do not match: {}" .format("/".join(key_chain))) + print ("The value of the first model is: {}" 
.format(model_first)) + print ("The value of the second model is: {}" .format(model_second)) + print(" ") + +if __name__ == "__main__": + main() From b945adc06ddea9a646a91a785e119b8c10b3ab27 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 19 Mar 2022 16:47:46 +0000 Subject: [PATCH 15/40] save_checkpoint perf monitoring --- Megatron-LM-v1.1.5-ZeRO3/megatron/training.py | 28 +++++++++++++------ Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py | 19 +++++++++++++ 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py index 0245caea4..68ce5c4c4 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py +++ b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py @@ -38,7 +38,7 @@ from megatron.model.realm_model import ICTBertModel from megatron.utils import check_adlr_autoresume_termination from megatron.utils import make_data_loader -from megatron.utils import report_memory, flops_calculator +from megatron.utils import report_memory, flops_calculator, throughput_calculator, checkpoint_throughput_calculator import deepspeed from deepspeed.runtime.utils import see_memory_usage @@ -384,6 +384,7 @@ def add_to_logging(name): add_to_logging('backward-clip-grad') add_to_logging('optimizer') add_to_logging('batch generator') + add_to_logging('save checkpoint') # Tensorboard values. 
if writer and torch.distributed.get_rank() == 0: @@ -423,12 +424,14 @@ def add_to_logging(name): total_loss_dict[got_nan_key]) total_loss_dict[skipped_iters_key] = 0 total_loss_dict[got_nan_key] = 0 + timers.log(timers_to_log, normalizer=args.log_interval) print_rank_0(log_string) if report_memory_flag: report_memory('after {} iterations'.format(iteration)) report_memory_flag = False - timers.log(timers_to_log, normalizer=args.log_interval) + flops_calculator(model, args, elapsed_time) + throughput_calculator(model, args, elapsed_time) return report_memory_flag @@ -462,11 +465,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler, loss_scale = None if args.fp16: loss_scale = optimizer.cur_scale if args.deepspeed else optimizer.loss_scale - report_memory_flag = training_log(loss_dict, total_loss_dict, - optimizer.param_groups[0]['lr'], - iteration, loss_scale, - report_memory_flag, skipped_iter, - model=model) # Autoresume if args.adlr_autoresume and \ @@ -475,9 +473,21 @@ def train(forward_step_func, model, optimizer, lr_scheduler, lr_scheduler) # Checkpointing - if args.save and args.save_interval and \ - iteration % args.save_interval == 0: + should_save_checkpoint = args.save and args.save_interval and \ + iteration % args.save_interval == 0 + timers('save checkpoint').start() + if should_save_checkpoint: save_checkpoint(iteration, model, optimizer, lr_scheduler) + timers('save checkpoint').stop() + + if should_save_checkpoint: + checkpoint_throughput_calculator(model, args, timers('save checkpoint').elapsed(reset=False)) + + report_memory_flag = training_log(loss_dict, total_loss_dict, + optimizer.param_groups[0]['lr'], + iteration, loss_scale, + report_memory_flag, skipped_iter, + model=model) # Evaluation # XXX temporarily disabled for ZeRO-3 diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py index 86fcf5ed4..9880f12ca 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py +++ 
b/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py @@ -194,3 +194,22 @@ def flops_calculator(model, args, iteration_time): effective_tera_flops_per_gpu = giga_flops_per_model_per_train_step / (iteration_time * 1000.0 * gpus_per_model) print_rank_0(f"Effective Tera Flops per GPU: {round(effective_tera_flops_per_gpu, 2)} and total parameters {round(approx_parameters_in_billions, 3)} B") + + +def throughput_calculator(model, args, iteration_time): + gpus_per_model = torch.distributed.get_world_size(group = mpu.get_model_parallel_group()) + samples_per_model = args.batch_size * args.seq_length + model_replica_count = torch.distributed.get_world_size() / gpus_per_model + approx_parameters_in_billions = get_parameters_in_billions(model) + samples_per_second = samples_per_model * model_replica_count / (iteration_time * 1000.0) + + print_rank_0(f'Samples per second: {round(samples_per_second, 2)} and total parameters {round(approx_parameters_in_billions, 3)} B') + + +def checkpoint_throughput_calculator(model, args, latency_sec): + approx_parameters_in_billions = get_parameters_in_billions(model) + checkpoint_multiplier = 12 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4) + checkpoint_giga_bytes = approx_parameters_in_billions * checkpoint_multiplier + giga_bytes_per_second = checkpoint_giga_bytes / latency_sec + + print_rank_0(f'Checkpoint Save GB: {round(checkpoint_giga_bytes, 3)}, GB_PerSec: {round(giga_bytes_per_second, 2)}, Latency(secs): {round(latency_sec, 3)}') From 64a8f75ccc05de25fe691320e961f81a7a113601 Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Tue, 22 Mar 2022 18:36:02 +0000 Subject: [PATCH 16/40] Disable checkpoint save on exit --- Megatron-LM-v1.1.5-ZeRO3/megatron/training.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py index 68ce5c4c4..bda88cc25 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py +++ 
b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py @@ -106,8 +106,8 @@ def pretrain(train_valid_test_dataset_provider, model_provider, valid_data_iterator, model, iteration, False) - if args.save and iteration != 0: - save_checkpoint(iteration, model, optimizer, lr_scheduler) +# if args.save and iteration != 0: +# save_checkpoint(iteration, model, optimizer, lr_scheduler) if args.do_test: # Run on test data. @@ -175,8 +175,8 @@ def get_optimizer(model): weight_decay=args.weight_decay) else: # Use torch Adam instead of Fused Adam from NVIDIA which seems to have some issue. - #optimizer = Adam(param_groups, - optimizer = torch.optim.AdamW(param_groups, + optimizer = Adam(param_groups, + #optimizer = torch.optim.AdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), From 44b8664274fd06176269d6c4777a7292bb3fddbd Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 22 Mar 2022 11:41:32 -0700 Subject: [PATCH 17/40] Perf statistics for save_checkpoint (#64) * save_checkpoint perf monitoring * Disable checkpoint save on exit --- Megatron-LM-v1.1.5-ZeRO3/megatron/training.py | 36 ++++++++++++------- Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py | 19 ++++++++++ 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py index 0245caea4..bda88cc25 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py +++ b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py @@ -38,7 +38,7 @@ from megatron.model.realm_model import ICTBertModel from megatron.utils import check_adlr_autoresume_termination from megatron.utils import make_data_loader -from megatron.utils import report_memory, flops_calculator +from megatron.utils import report_memory, flops_calculator, throughput_calculator, checkpoint_throughput_calculator import deepspeed from deepspeed.runtime.utils import see_memory_usage @@ -106,8 +106,8 @@ def 
pretrain(train_valid_test_dataset_provider, model_provider, valid_data_iterator, model, iteration, False) - if args.save and iteration != 0: - save_checkpoint(iteration, model, optimizer, lr_scheduler) +# if args.save and iteration != 0: +# save_checkpoint(iteration, model, optimizer, lr_scheduler) if args.do_test: # Run on test data. @@ -175,8 +175,8 @@ def get_optimizer(model): weight_decay=args.weight_decay) else: # Use torch Adam instead of Fused Adam from NVIDIA which seems to have some issue. - #optimizer = Adam(param_groups, - optimizer = torch.optim.AdamW(param_groups, + optimizer = Adam(param_groups, + #optimizer = torch.optim.AdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), @@ -384,6 +384,7 @@ def add_to_logging(name): add_to_logging('backward-clip-grad') add_to_logging('optimizer') add_to_logging('batch generator') + add_to_logging('save checkpoint') # Tensorboard values. if writer and torch.distributed.get_rank() == 0: @@ -423,12 +424,14 @@ def add_to_logging(name): total_loss_dict[got_nan_key]) total_loss_dict[skipped_iters_key] = 0 total_loss_dict[got_nan_key] = 0 + timers.log(timers_to_log, normalizer=args.log_interval) print_rank_0(log_string) if report_memory_flag: report_memory('after {} iterations'.format(iteration)) report_memory_flag = False - timers.log(timers_to_log, normalizer=args.log_interval) + flops_calculator(model, args, elapsed_time) + throughput_calculator(model, args, elapsed_time) return report_memory_flag @@ -462,11 +465,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler, loss_scale = None if args.fp16: loss_scale = optimizer.cur_scale if args.deepspeed else optimizer.loss_scale - report_memory_flag = training_log(loss_dict, total_loss_dict, - optimizer.param_groups[0]['lr'], - iteration, loss_scale, - report_memory_flag, skipped_iter, - model=model) # Autoresume if args.adlr_autoresume and \ @@ -475,9 +473,21 @@ def train(forward_step_func, model, optimizer, 
lr_scheduler, lr_scheduler) # Checkpointing - if args.save and args.save_interval and \ - iteration % args.save_interval == 0: + should_save_checkpoint = args.save and args.save_interval and \ + iteration % args.save_interval == 0 + timers('save checkpoint').start() + if should_save_checkpoint: save_checkpoint(iteration, model, optimizer, lr_scheduler) + timers('save checkpoint').stop() + + if should_save_checkpoint: + checkpoint_throughput_calculator(model, args, timers('save checkpoint').elapsed(reset=False)) + + report_memory_flag = training_log(loss_dict, total_loss_dict, + optimizer.param_groups[0]['lr'], + iteration, loss_scale, + report_memory_flag, skipped_iter, + model=model) # Evaluation # XXX temporarily disabled for ZeRO-3 diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py index 86fcf5ed4..9880f12ca 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py +++ b/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py @@ -194,3 +194,22 @@ def flops_calculator(model, args, iteration_time): effective_tera_flops_per_gpu = giga_flops_per_model_per_train_step / (iteration_time * 1000.0 * gpus_per_model) print_rank_0(f"Effective Tera Flops per GPU: {round(effective_tera_flops_per_gpu, 2)} and total parameters {round(approx_parameters_in_billions, 3)} B") + + +def throughput_calculator(model, args, iteration_time): + gpus_per_model = torch.distributed.get_world_size(group = mpu.get_model_parallel_group()) + samples_per_model = args.batch_size * args.seq_length + model_replica_count = torch.distributed.get_world_size() / gpus_per_model + approx_parameters_in_billions = get_parameters_in_billions(model) + samples_per_second = samples_per_model * model_replica_count / (iteration_time * 1000.0) + + print_rank_0(f'Samples per second: {round(samples_per_second, 2)} and total parameters {round(approx_parameters_in_billions, 3)} B') + + +def checkpoint_throughput_calculator(model, args, latency_sec): + approx_parameters_in_billions = 
get_parameters_in_billions(model) + checkpoint_multiplier = 12 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4) + checkpoint_giga_bytes = approx_parameters_in_billions * checkpoint_multiplier + giga_bytes_per_second = checkpoint_giga_bytes / latency_sec + + print_rank_0(f'Checkpoint Save GB: {round(checkpoint_giga_bytes, 3)}, GB_PerSec: {round(giga_bytes_per_second, 2)}, Latency(secs): {round(latency_sec, 3)}') From ff4bd69edb15acbd94881e73b41c1c54e50dec95 Mon Sep 17 00:00:00 2001 From: GuanhuaWang Date: Wed, 21 Sep 2022 18:48:16 +0000 Subject: [PATCH 18/40] add logs for a100-80 --- .../log_9_21_22/gpt2-unfused.txt | 599 ++++++++++++++ .../log_9_21_22/gpt2_fused_z2.txt | 781 ++++++++++++++++++ 2 files changed, 1380 insertions(+) create mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt create mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt new file mode 100644 index 000000000..33985e8db --- /dev/null +++ b/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt @@ -0,0 +1,599 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) +Model name = gpt2-large +[2022-09-21 18:42:17,245] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:17,246] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
+[2022-09-21 18:42:18,108] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-21 18:42:18,109] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl +[2022-09-21 18:42:21,535] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +NCCL version 2.10.3+cuda11.3 +[2022-09-21 18:42:21,770] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:21,772] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:21,772] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:21,772] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:22,127] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:22,127] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_enabled ................. 
True +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_rounding ............ 
0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_config .................. 
{ + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:22,130] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module utils... 
+Time to load utils op: 0.3399326801300049 seconds +[2022-09-21 18:42:23,204] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt +test_save -- 10.13 GB, 6.83 secs, 1.48 gb/s +********************************************* +[2022-09-21 18:42:30,157] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:30,164] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:30,277] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:30,278] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:30,278] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:30,278] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:30,656] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:30,656] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_offset .............. 
1000 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:30,659] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "mock", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0004949569702148438 seconds +[2022-09-21 18:42:30,786] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +test_ds_mock_save -- 0.00 GB, 0.93 secs, 0.00 gb/s +********************************************* +[2022-09-21 18:42:32,824] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:32,831] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:32,926] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:32,927] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:32,927] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:32,927] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:33,248] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:33,248] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] elasticity_enabled ........... 
False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_groups .............. 
1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "python", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.000392913818359375 seconds +[2022-09-21 18:42:33,377] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523621, 'write_secs': 5.274229288101196} +test_ds_py_save -- 10.13 GB, 6.32 secs, 1.60 gb/s +********************************************* +[2022-09-21 18:42:39,940] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:39,946] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:40,048] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:40,049] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:40,049] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:40,049] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:40,439] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:40,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... +Building extension module async_io... +Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module async_io... +Time to load async_io op: 0.4869067668914795 seconds +[2022-09-21 18:42:41,329] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 
0 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_job_name ......... 
DeepSpeedJobName +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_optimization_stage ...... 
0 +[2022-09-21 18:42:41,332] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "fast", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0004849433898925781 seconds +[2022-09-21 18:42:41,458] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0003745555877685547 seconds +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 1.8456230163574219, 'aio_write_secs': 0.9408478736877441, 'aio_bytes': 10874523136, 'aio_gbs': 10.76442766994695, 'slow_bytes': 483, 'slow_write_secs': 0.0002315044403076172, 'fill_buffer_count': 4519, 'fill_buffer_secs': 0.9024286270141602, 'fill_buffer_speed': 11.22270347101499, 'save_storage': 0, 'save_storage_bytes': 0} +test_ds_fast_save -- 10.13 GB, 3.00 secs, 3.38 gb/s +********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt new file mode 100644 index 000000000..9871b634e --- /dev/null +++ b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt @@ -0,0 +1,781 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=True, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=2) +Model name = gpt2-large +[2022-09-21 18:45:23,129] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:45:23,130] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
+[2022-09-21 18:45:23,991] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-21 18:45:23,991] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl +[2022-09-21 18:45:27,189] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +NCCL version 2.10.3+cuda11.3 +[2022-09-21 18:45:27,478] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Creating extension directory /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam... +Detected CUDA files, patching ldflags +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam/build.ninja... +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options 
'-fPIC' -lineinfo -O3 --use_fast_math -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o +[2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o +[3/3] c++ fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_adam.so +Loading extension module fused_adam... 
+Time to load fused_adam op: 19.252447843551636 seconds +[2022-09-21 18:45:47,263] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:45:47,263] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:45:47,263] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:45:47,263] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:45:47,263] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:45:47,263] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module utils... 
+Time to load utils op: 0.3341379165649414 seconds +[2022-09-21 18:45:47,651] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:45:47,652] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB +[2022-09-21 18:45:47,652] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.58 GB, percent = 1.3% +[2022-09-21 18:45:47,945] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:45:47,946] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB +[2022-09-21 18:45:47,946] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% +[2022-09-21 18:45:48,634] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:45:48,635] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,635] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% +[2022-09-21 18:45:48,681] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:45:48,682] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,682] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.53 GB, percent = 1.3% +[2022-09-21 18:45:48,733] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:45:48,734] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,734] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.4 GB, percent = 1.3% +[2022-09-21 18:45:48,796] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:45:48,797] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 7.38 GB 
Max_CA 7 GB +[2022-09-21 18:45:48,797] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,848] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:45:48,849] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 7.38 GB Max_CA 7 GB +[2022-09-21 18:45:48,849] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,920] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:45:48,921] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 16.04 GB Max_CA 16 GB +[2022-09-21 18:45:48,921] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,921] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:45:48,968] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:45:48,969] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 16.04 GB Max_CA 16 GB +[2022-09-21 18:45:48,969] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:45:48,969] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:45:48,969] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_offset .............. 
1000 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:45:48,972] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0004029273986816406 seconds +[2022-09-21 18:45:49,143] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt +[2022-09-21 18:45:56,478] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_save/zero_to_fp32.py +[2022-09-21 18:45:56,479] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_save -- 10.13 GB, 7.51 secs, 1.35 gb/s +********************************************* +[2022-09-21 18:45:56,603] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:45:56,610] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:45:56,709] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.0011363029479980469 seconds +[2022-09-21 18:45:56,771] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:45:56,771] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:45:56,771] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:45:56,771] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:45:56,771] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:45:56,771] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.00023317337036132812 seconds +[2022-09-21 18:45:56,823] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:45:56,824] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 16 GB +[2022-09-21 18:45:56,824] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.55 GB, percent = 1.3% +[2022-09-21 18:45:57,123] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:45:57,124] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,124] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.54 GB, percent = 1.3% +[2022-09-21 18:45:57,614] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:45:57,615] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,616] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.51 GB, percent = 1.3% 
+[2022-09-21 18:45:57,661] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:45:57,662] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,662] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% +[2022-09-21 18:45:57,713] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:45:57,714] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,714] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,775] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:45:57,775] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:45:57,776] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:57,827] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:45:57,828] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:45:57,828] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,887] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:45:57,887] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:45:57,888] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.38 GB, percent = 1.3% +[2022-09-21 18:45:57,888] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:45:57,933] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:45:57,934] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:45:57,934] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,934] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:45:57,935] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:45:57,935] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... 
None +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... 
False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_gradients_enabled ..... 
False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_optimization_stage ...... 
2 +[2022-09-21 18:45:57,937] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "mock", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.000377655029296875 seconds +[2022-09-21 18:45:57,942] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +[2022-09-21 18:45:59,953] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/zero_to_fp32.py +[2022-09-21 18:45:59,953] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_mock_save -- 0.00 GB, 2.02 secs, 0.00 gb/s +********************************************* +[2022-09-21 18:46:00,921] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 
18:46:00,928] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:46:01,026] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... +Time to load fused_adam op: 0.001192331314086914 seconds +[2022-09-21 18:46:01,079] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:46:01,079] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:46:01,079] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:46:01,079] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:46:01,080] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:46:01,080] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0002560615539550781 seconds +[2022-09-21 18:46:01,130] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:46:01,131] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB +[2022-09-21 18:46:01,132] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.63 GB, percent = 1.3% +[2022-09-21 18:46:01,426] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:46:01,427] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,427] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,861] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:46:01,862] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,863] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,907] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:46:01,908] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,908] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,959] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:46:01,960] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,960] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,013] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:46:02,013] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB 
Max_CA 6 GB +[2022-09-21 18:46:02,014] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,065] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:46:02,066] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:02,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,125] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:46:02,126] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:02,126] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,126] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:46:02,172] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:46:02,173] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:02,173] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:46:02,174] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:46:02,174] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] elasticity_enabled ........... 
False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_groups .............. 
1 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:46:02,176] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "python", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0003757476806640625 seconds +[2022-09-21 18:46:02,181] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909547, 'write_secs': 0.7758586406707764} +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390323, 'write_secs': 4.455736398696899} +[2022-09-21 18:46:09,408] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/zero_to_fp32.py +[2022-09-21 18:46:09,409] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_py_save -- 10.13 GB, 7.23 secs, 1.40 gb/s +********************************************* +[2022-09-21 18:46:09,498] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:46:09,504] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:46:09,602] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.0010247230529785156 seconds +[2022-09-21 18:46:09,666] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:46:09,666] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:46:09,666] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:46:09,666] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:46:09,666] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:46:09,666] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0002410411834716797 seconds +[2022-09-21 18:46:09,746] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:46:09,747] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB +[2022-09-21 18:46:09,747] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.6 GB, percent = 1.3% +[2022-09-21 18:46:10,065] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:46:10,066] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:10,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.59 GB, percent = 1.3% +[2022-09-21 18:46:11,872] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:46:11,873] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,873] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% 
+[2022-09-21 18:46:11,918] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:46:11,919] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,919] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% +[2022-09-21 18:46:11,969] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:46:11,970] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,971] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,030] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:46:12,030] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:12,031] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,081] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:46:12,082] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:12,082] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,141] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:46:12,142] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:12,142] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,142] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:46:12,188] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:46:12,188] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:12,189] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:46:12,189] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... +Building extension module async_io... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module async_io... +Time to load async_io op: 0.5492517948150635 seconds +[2022-09-21 18:46:13,140] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] checkpoint_config ............ 
{'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_clipping ............ 
0.0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_type ................ 
0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_config .................. 
{ + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:46:13,143] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "fast", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.00046539306640625 seconds +[2022-09-21 18:46:13,149] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... 
+No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0002307891845703125 seconds +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0.4641237258911133, 'aio_write_secs': 0.17467093467712402, 'aio_bytes': 1585909248, 'aio_gbs': 8.455860654115417, 'slow_bytes': 297, 'slow_write_secs': 0.00024700164794921875, 'fill_buffer_count': 153, 'fill_buffer_secs': 0.3299696445465088, 'fill_buffer_speed': 4.476148362022062, 'save_storage': 0, 'save_storage_bytes': 0} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0003643035888671875 seconds +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 1.366792917251587, 'aio_write_secs': 0.8517467975616455, 'aio_bytes': 9288390144, 'aio_gbs': 10.156172524167351, 'slow_bytes': 177, 'slow_write_secs': 0.0003936290740966797, 'fill_buffer_count': 25, 'fill_buffer_secs': 0.5708425045013428, 'fill_buffer_speed': 15.153895084423882, 'save_storage': 0, 'save_storage_bytes': 0} +[2022-09-21 18:46:17,080] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/zero_to_fp32.py +[2022-09-21 18:46:17,080] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_fast_save -- 10.13 GB, 3.94 secs, 2.57 gb/s +********************************************* From e4817a1f8c9a8e1bda5618ce34465ce5adf9875c Mon Sep 17 00:00:00 2001 From: GuanhuaWang Date: Thu, 22 Sep 2022 01:23:54 +0000 Subject: [PATCH 19/40] add torch* error log with half flag but without fused flag --- 
.../log_9_21_22/torch_star_half_error.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt new file mode 100644 index 000000000..8d06a1011 --- /dev/null +++ b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt @@ -0,0 +1,13 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) +Model name = gpt2-large +[2022-09-22 01:22:52,520] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 +[2022-09-22 01:22:52,524] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... 
+[2022-09-22 01:22:53,396] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-22 01:22:53,397] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2022-09-22 01:22:53,400] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead +NCCL version 2.10.3+cuda11.3 +[2022-09-22 01:22:56,452] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2022-09-22 01:22:56,454] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-22 01:22:56,482] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} From b297e1776f8b9a266cc18852fd331e811b31f422 Mon Sep 17 00:00:00 2001 From: GuanhuaWang Date: Thu, 22 Sep 2022 01:31:01 +0000 Subject: [PATCH 20/40] log for error --- .../log_9_21_22/torch_star_half_error.txt | 75 +++++++++++++++++-- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt index 8d06a1011..5a5292f6e 100644 --- a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt +++ b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt @@ -2,12 +2,71 @@ Performance test of deepspeed integration of fast model checkpointing. 
torch version = 1.12.0+cu113 args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) Model name = gpt2-large -[2022-09-22 01:22:52,520] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 -[2022-09-22 01:22:52,524] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... -[2022-09-22 01:22:53,396] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-22 01:22:53,397] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl -[2022-09-22 01:22:53,400] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead +[2022-09-22 01:29:33,721] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 +[2022-09-22 01:29:33,725] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... +-------------------------------------------------------------------------- +WARNING: No preset parameters were found for the device that Open MPI +detected: + + Local host: azwuse57c00009D + Device name: mlx5_ib0 + Device vendor ID: 0x02c9 + Device vendor part ID: 4124 + +Default device parameters will be used, which may result in lower +performance. You can edit any of the files specified by the +btl_openib_device_param_files MCA parameter to set values for your +device. + +NOTE: You can turn off this warning by setting the MCA parameter + btl_openib_warn_no_device_params_found to 0. 
+-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +By default, for Open MPI 4.0 and later, infiniband ports on a device +are not used by default. The intent is to use UCX for these devices. +You can override this policy by setting the btl_openib_allow_ib MCA parameter +to true. + + Local host: azwuse57c00009D + Local adapter: mlx5_ib0 + Local port: 1 + +-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +WARNING: There was an error initializing an OpenFabrics device. + + Local host: azwuse57c00009D + Local device: mlx5_ib4 +-------------------------------------------------------------------------- +[2022-09-22 01:29:34,587] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-22 01:29:34,587] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2022-09-22 01:29:34,591] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead NCCL version 2.10.3+cuda11.3 -[2022-09-22 01:22:56,452] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False -[2022-09-22 01:22:56,454] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-22 01:22:56,482] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[2022-09-22 01:29:38,429] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2022-09-22 01:29:38,430] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-22 01:29:38,461] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = 
{basic_optimizer.__class__.__name__} +Traceback (most recent call last): + File "deepspeed_save_model.py", line 133, in + main() + File "deepspeed_save_model.py", line 129, in main + run(model, model_name, ckpt_name, args) + File "deepspeed_save_model.py", line 106, in run + write_sec = test_save(tag, folder, model, args, writer_type) + File "deepspeed_save_model.py", line 76, in test_save + ds_engine = _get_ds_engine(model, ds_config) + File "deepspeed_save_model.py", line 52, in _get_ds_engine + ds_engine, _, _, _ = deepspeed.initialize( + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/__init__.py", line 124, in initialize + engine = DeepSpeedEngine(args=args, + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 322, in __init__ + self._configure_optimizer(optimizer, model_parameters) + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1178, in _configure_optimizer + self.optimizer = self._configure_fp16_optimizer(basic_optimizer) + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1314, in _configure_fp16_optimizer + or self.fp16_fused_mode() \ + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 792, in fp16_fused_mode + return self._config.fp16_fused_mode +AttributeError: 'DeepSpeedConfig' object has no attribute 'fp16_fused_mode' +[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / no device params found +[azwuse57c00009D:37114] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages +[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / ib port not selected From f05dab111f49e8f94f3163793e88fd197becb645 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 5 Oct 2022 17:58:55 +0000 Subject: [PATCH 21/40] local rank arg --- fast_io/model_checkpoint/deepspeed_save_model.py | 2 +- fast_io/model_checkpoint/save_model_utils.py | 5 +++++ 2 files 
changed, 6 insertions(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 081fe0299..cb0ec6009 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -102,7 +102,7 @@ def run(model, model_name, ckpt_name, args): for tag, writer_type in writer_dict.items(): folder = os.path.join(args.folder, ckpt_name, tag) if os.path.exists(folder): - shutil.rmtree(folder) + shutil.rmtree(folder, ignore_errors=True) write_sec = test_save(tag, folder, model, args, writer_type) ckpt_size = _get_folder_size(folder) gb_size = ckpt_size / (1024**3) diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 24f9f87d1..02ea1942b 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -72,6 +72,11 @@ def parse_arguments(): required=True, help='Hugging Face transformers tag of model (e.g., gpt2).') + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) + parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') From 1aa971aabf9d3a42354914576ab5fb7ee19437b4 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 5 Oct 2022 15:53:02 -0400 Subject: [PATCH 22/40] Handle local_rank arg (#78) * save_checkpoint perf monitoring * Disable checkpoint save on exit * local rank arg --- fast_io/model_checkpoint/deepspeed_save_model.py | 2 +- fast_io/model_checkpoint/save_model_utils.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 081fe0299..cb0ec6009 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -102,7 +102,7 @@ def run(model, model_name, ckpt_name, args): for tag, writer_type in 
writer_dict.items(): folder = os.path.join(args.folder, ckpt_name, tag) if os.path.exists(folder): - shutil.rmtree(folder) + shutil.rmtree(folder, ignore_errors=True) write_sec = test_save(tag, folder, model, args, writer_type) ckpt_size = _get_folder_size(folder) gb_size = ckpt_size / (1024**3) diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 24f9f87d1..02ea1942b 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -72,6 +72,11 @@ def parse_arguments(): required=True, help='Hugging Face transformers tag of model (e.g., gpt2).') + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) + parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') From 98b2f8a4b7c4eb4e760fabda27b12b8816c0d53f Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 5 Oct 2022 21:00:03 +0000 Subject: [PATCH 23/40] Single writer option --- fast_io/model_checkpoint/deepspeed_save_model.py | 1 + fast_io/model_checkpoint/save_model_utils.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index cb0ec6009..44e1e66f5 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -43,6 +43,7 @@ def _get_ds_config(args, writer_type): "io_buffer_size": args.io_buffer_mb * (1024**2), "io_buffer_double": not args.single_io_buffer, "show_statistics": not args.no_statistics, + "data_parallel": not args.single_writer } return ds_config diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 02ea1942b..af0b3d314 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -118,6 +118,9 @@ def parse_arguments(): action='store_true', help='Disable double 
buffering of i/o buffer.') + + parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') + args = parser.parse_args() print(f'args = {args}') return args From 2e4228518f660f7151aceafe5ade6557e91b4aed Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 5 Oct 2022 17:02:10 -0400 Subject: [PATCH 24/40] Single writer option (#79) * save_checkpoint perf monitoring * Disable checkpoint save on exit * local rank arg * Single writer option --- fast_io/model_checkpoint/deepspeed_save_model.py | 1 + fast_io/model_checkpoint/save_model_utils.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index cb0ec6009..44e1e66f5 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -43,6 +43,7 @@ def _get_ds_config(args, writer_type): "io_buffer_size": args.io_buffer_mb * (1024**2), "io_buffer_double": not args.single_io_buffer, "show_statistics": not args.no_statistics, + "data_parallel": not args.single_writer } return ds_config diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 02ea1942b..af0b3d314 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -118,6 +118,9 @@ def parse_arguments(): action='store_true', help='Disable double buffering of i/o buffer.') + + parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') + args = parser.parse_args() print(f'args = {args}') return args From a567adf6a99f0a7c976993a50420b31f218125e9 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 12 Oct 2022 11:43:06 +0000 Subject: [PATCH 25/40] Allow missing folder --- fast_io/model_checkpoint/deepspeed_save_model.py | 8 +++++--- 
fast_io/model_checkpoint/requirements.txt | 1 + fast_io/model_checkpoint/save_model_utils.py | 6 +++--- 3 files changed, 9 insertions(+), 6 deletions(-) create mode 100644 fast_io/model_checkpoint/requirements.txt diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 44e1e66f5..ffe0ff540 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -95,15 +95,17 @@ def _get_folder_size(folder): def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') writer_dict = { - 'test_save': None, - 'test_ds_mock_save': 'mock', - 'test_ds_py_save': 'python', + # 'test_save': None, + # 'test_ds_mock_save': 'mock', + # 'test_ds_py_save': 'python', 'test_ds_fast_save': 'fast' } for tag, writer_type in writer_dict.items(): folder = os.path.join(args.folder, ckpt_name, tag) if os.path.exists(folder): shutil.rmtree(folder, ignore_errors=True) + # if not os.path.exists(folder): + # os.makedirs(folder, exist_ok=True) write_sec = test_save(tag, folder, model, args, writer_type) ckpt_size = _get_folder_size(folder) gb_size = ckpt_size / (1024**3) diff --git a/fast_io/model_checkpoint/requirements.txt b/fast_io/model_checkpoint/requirements.txt new file mode 100644 index 000000000..976a2b1f3 --- /dev/null +++ b/fast_io/model_checkpoint/requirements.txt @@ -0,0 +1 @@ +transformers diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index af0b3d314..9ab2859f3 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -41,9 +41,9 @@ def get_model(model_tag): def validate_arguments(args): success = True - if not os.path.exists(args.folder): - print(f'Invalid folder: {args.folder}') - success = False + # if not os.path.exists(args.folder): + # print(f'Invalid folder: {args.folder}') + # success = False if not args.model in HF_MODELS: 
print(f'{args.model} is not a supported HF model tag') From 65793bd13bcacf2df5c68ada006007a40f59d082 Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Fri, 10 Feb 2023 01:25:29 +0000 Subject: [PATCH 26/40] DP writer refactor --- fast_io/model_checkpoint/deepspeed_save_model.py | 6 +++--- fast_io/model_checkpoint/save_model_utils.py | 2 +- fast_io/model_checkpoint/torch_save_tensor.py | 13 +++++++++---- fast_io/model_checkpoint/torch_save_utils.py | 12 ++++++++---- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index ffe0ff540..70148e6ca 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -32,8 +32,8 @@ def _get_ds_config(args, writer_type): "block_size": 8 * (1024**2), "queue_depth": 8, "single_submit": False, - "overlap_events": False, - "thread_count": 1, + "overlap_events": True, + "thread_count": 2, } } @@ -43,7 +43,7 @@ def _get_ds_config(args, writer_type): "io_buffer_size": args.io_buffer_mb * (1024**2), "io_buffer_double": not args.single_io_buffer, "show_statistics": not args.no_statistics, - "data_parallel": not args.single_writer + "data_parallel": "socket" # None # not args.single_writer } return ds_config diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 9ab2859f3..4101795f6 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -119,7 +119,7 @@ def parse_arguments(): help='Disable double buffering of i/o buffer.') - parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') + #parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') args = parser.parse_args() print(f'args = {args}') diff --git 
a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 80d5f1358..386d7156b 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -4,6 +4,7 @@ import os from torch_save_utils import PINNED_BUFFER_MB from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save +import deepspeed def run(args): @@ -14,9 +15,9 @@ def run(args): device=device) fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, - 'test_ds_py_save': test_ds_py_save, + # 'test_save': test_save, + # 'test_ds_mock_save': test_ds_mock_save, + # 'test_ds_py_save': test_ds_py_save, 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): @@ -64,6 +65,10 @@ def parse_arguments(): parser.add_argument('--single_io_buffer', action='store_true', help='Disable double buffering of i/o buffer.') + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) args = parser.parse_args() print(f'args = {args}') @@ -78,7 +83,7 @@ def main(): if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') quit() - + deepspeed.init_distributed() run(args) diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index c01fd014c..e274b5cda 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -3,7 +3,7 @@ import os import deepspeed from deepspeed.ops.aio import AsyncIOBuilder -from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter +from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter, FastFileWriterConfig AIO_QUEUE_DEPTH = 8 AIO_BLOCK_SIZE = 8 * (1024**2) @@ -62,10 +62,14 @@ def test_ds_fast_save(file, buffer, args): dtype=torch.uint8, device='cpu').pin_memory() st = time.time() + config = FastFileWriterConfig(aio_handle=h, + pinned_tensor=pinned_memory, + double_buffer=not 
args.single_io_buffer, + num_parallel_writers=1, + writer_rank=0) + ds_fast_writer = FastFileWriter(file_path=file, - aio_handle=h, - pinned_tensor=pinned_memory, - double_buffer=not args.single_io_buffer) + config=config) torch.save(f=ds_fast_writer, obj=buffer, _use_new_zipfile_serialization=not args.legacy) From 5bfdf04d65a66c2c42f094c9a680dc40925614ab Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 12 Feb 2025 11:59:23 -0500 Subject: [PATCH 27/40] Update for DS; Add GDS Signed-off-by: Olatunji Ruwase --- fast_io/model_checkpoint/save_model_utils.py | 54 ++++++++----------- fast_io/model_checkpoint/torch_save_model.py | 13 +++-- fast_io/model_checkpoint/torch_save_tensor.py | 13 +++-- fast_io/model_checkpoint/torch_save_utils.py | 45 +++++++++++++--- 4 files changed, 75 insertions(+), 50 deletions(-) diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 4101795f6..739e31124 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -5,47 +5,35 @@ from torch_save_utils import PINNED_BUFFER_MB -def _get_gpt_j_6B(tag): - model_name = "EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name) - ckpt_name = "gpt-j-6B" - return model, model_name, ckpt_name - - -def _get_tiny_t5(tag): - model_name = "hf-internal-testing/tiny-random-t5" - model = T5ForConditionalGeneration.from_pretrained(model_name) - ckpt_name = "tiny-random-t5" - return model, model_name, ckpt_name - - -def _get_hf_gpt2(tag): - model_name = tag - model = AutoModelForCausalLM.from_pretrained(tag) - ckpt_name = tag - return model, model_name, ckpt_name - - -HF_MODELS = { - 'tiny-t5': _get_tiny_t5, - 'gpt-j-6B': _get_gpt_j_6B, - 'gpt2': _get_hf_gpt2, - 'gpt2-large': _get_hf_gpt2, - 'gpt2-xl': _get_hf_gpt2, +TINY_T5 = 'tiny-t5' +PHI3_MINI = 'phi3' +PHI3_VISION = 'phi3-v' +LLAMA3_1B = 'llama3-1B' + +HF_MODELS_DICT = { + TINY_T5: "hf-internal-testing/tiny-random-t5", 
+ PHI3_MINI: "microsoft/Phi-3.5-mini-instruct", + PHI3_VISION: "microsoft/Phi-3.5-vision-instruct", + LLAMA3_1B: "meta-llama/Llama-3.2-1B", } +def _get_hf_model(tag): + model_name = HF_MODELS_DICT[tag] + if tag == TINY_T5: + model = T5ForConditionalGeneration.from_pretrained(model_name) + else: + model = AutoModelForCausalLM.from_pretrained(model_name) + + return model, model_name, tag def get_model(model_tag): - return HF_MODELS[model_tag](model_tag) + return _get_hf_model(model_tag) def validate_arguments(args): success = True - # if not os.path.exists(args.folder): - # print(f'Invalid folder: {args.folder}') - # success = False - if not args.model in HF_MODELS: + if not args.model in HF_MODELS_DICT: print(f'{args.model} is not a supported HF model tag') success = False @@ -70,7 +58,7 @@ def parse_arguments(): default=None, type=str, required=True, - help='Hugging Face transformers tag of model (e.g., gpt2).') + help=f'Hugging Face transformers tag of model. Available models = {list(HF_MODELS_DICT.keys())}') parser.add_argument('--local_rank', type=int, diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index 245d49e30..a489ee927 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -2,19 +2,22 @@ import torch from torch.optim import Adam import os -from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save from save_model_utils import get_model, validate_arguments, parse_arguments def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, - 'test_ds_py_save': test_ds_py_save, - 'test_ds_fast_save': test_ds_fast_save + # 'test_save': test_save, + # 'test_ds_mock_save': test_ds_mock_save, + # 
'test_ds_py_save': test_ds_py_save, + # 'test_ds_aio_fast_save': test_ds_aio_fast_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save } for tag, fn in fn_dict.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue file = os.path.join(args.folder, f'{tag}_{ckpt_name}.pt') print(f'checkpoint file = {file}') if os.path.isfile(file): diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 386d7156b..accbcd8b2 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -3,7 +3,7 @@ import torch import os from torch_save_utils import PINNED_BUFFER_MB -from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save import deepspeed @@ -15,12 +15,15 @@ def run(args): device=device) fn_dict = { - # 'test_save': test_save, - # 'test_ds_mock_save': test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - 'test_ds_fast_save': test_ds_fast_save + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save } for tag, fn in fn_dict.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') print(f'checkpoint file = {file}') if os.path.isfile(file): diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index e274b5cda..166ce4582 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -2,12 +2,13 @@ import torch import os import deepspeed -from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.ops.op_builder import AsyncIOBuilder, GDSBuilder from deepspeed.io import 
MockFileWriter, PyFileWriter, FastFileWriter, FastFileWriterConfig +from deepspeed.accelerator import get_accelerator AIO_QUEUE_DEPTH = 8 AIO_BLOCK_SIZE = 8 * (1024**2) -AIO_THREAD_COUNT = 1 +AIO_INTRA_OP_PARALLEL = 1 AIO_SINGLE_SUBMIT = False AIO_OVERLAP_EVENTS = False PINNED_BUFFER_MB = 64 @@ -18,9 +19,16 @@ def _get_aio_handle(): queue_depth=AIO_QUEUE_DEPTH, single_submit=AIO_SINGLE_SUBMIT, overlap_events=AIO_SINGLE_SUBMIT, - num_threads=AIO_THREAD_COUNT) + intra_op_parallelism=AIO_INTRA_OP_PARALLEL) return h +def _get_gds_handle(): + h = GDSBuilder().load().gds_handle(block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + intra_op_parallelism=AIO_INTRA_OP_PARALLEL) + return h def test_save(file, buffer, args): st = time.time() @@ -55,21 +63,37 @@ def test_ds_py_save(file, buffer, args): ds_py_writer._dump_state() return write_sec - -def test_ds_fast_save(file, buffer, args): +def _get_aio_components(args): h = _get_aio_handle() pinned_memory = torch.zeros(args.io_buffer_mb * (1024**2), dtype=torch.uint8, device='cpu').pin_memory() + return h, pinned_memory + +def _get_gds_components(args): + h = _get_gds_handle() + pinned_memory = torch.empty(args.io_buffer_mb * (1024**2), + dtype=torch.uint8, + device=get_accelerator().device_name()) + h.pin_device_tensor(pinned_memory) + return h, pinned_memory + + + +def _test_ds_fast_save(file, buffer, args, use_gds): + if use_gds: + h, pinned_memory = _get_gds_components(args) + else: + h, pinned_memory = _get_aio_components(args) st = time.time() - config = FastFileWriterConfig(aio_handle=h, + fast_writer_config = FastFileWriterConfig(aio_handle=h, pinned_tensor=pinned_memory, double_buffer=not args.single_io_buffer, num_parallel_writers=1, writer_rank=0) ds_fast_writer = FastFileWriter(file_path=file, - config=config) + config=fast_writer_config) torch.save(f=ds_fast_writer, obj=buffer, _use_new_zipfile_serialization=not args.legacy) @@ -78,3 
+102,10 @@ def test_ds_fast_save(file, buffer, args): if not args.no_statistics: ds_fast_writer._dump_state() return write_sec + + +def test_ds_aio_fast_save(file, buffer, args): + return _test_ds_fast_save(file, buffer, args, False) + +def test_ds_gds_fast_save(file, buffer, args): + return _test_ds_fast_save(file, buffer, args, True) From 9a27914d5660b46a01765c8e50f74964b80fdbde Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 20 Feb 2025 07:17:04 -0500 Subject: [PATCH 28/40] Integrate GDS into deepspeed_model_save --- .../model_checkpoint/deepspeed_save_model.py | 23 +++++++++++-------- fast_io/model_checkpoint/save_model_utils.py | 4 +++- fast_io/model_checkpoint/torch_save_model.py | 17 +++++++------- fast_io/model_checkpoint/torch_save_tensor.py | 11 +++++---- fast_io/model_checkpoint/torch_save_utils.py | 8 +++---- 5 files changed, 35 insertions(+), 28 deletions(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 70148e6ca..ea97dd717 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -6,10 +6,10 @@ import random import numpy as np import deepspeed +from deepspeed.accelerator import get_accelerator from save_model_utils import get_model, validate_arguments, parse_arguments - -def _get_ds_config(args, writer_type): +def _get_ds_config(args, writer_type, use_gds): ds_config = { "train_micro_batch_size_per_gpu": 1, "zero_optimization": { @@ -33,7 +33,8 @@ def _get_ds_config(args, writer_type): "queue_depth": 8, "single_submit": False, "overlap_events": True, - "thread_count": 2, + "intra_op_parallelism": 2, + "use_gds": use_gds, } } @@ -69,11 +70,12 @@ def _free_ds_memory(ds_engine): ds_engine = None del ds_engine gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() def test_save(tag, folder, model, args, writer_type): - ds_config = _get_ds_config(args, writer_type) + use_gds = writer_type == 
'fast' and 'gds' in tag + ds_config = _get_ds_config(args, writer_type, use_gds) ds_engine = _get_ds_engine(model, ds_config) if args.zero_stage == 0: _do_optimizer_step(ds_engine) @@ -95,10 +97,11 @@ def _get_folder_size(folder): def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') writer_dict = { - # 'test_save': None, - # 'test_ds_mock_save': 'mock', - # 'test_ds_py_save': 'python', - 'test_ds_fast_save': 'fast' + 'test_save': None, + 'test_ds_mock_save': 'mock', + 'test_ds_py_save': 'python', + 'test_ds_aio_fast_save': 'fast', + 'test_ds_gds_fast_save': 'fast', } for tag, writer_type in writer_dict.items(): folder = os.path.join(args.folder, ckpt_name, tag) @@ -111,7 +114,7 @@ def run(model, model_name, ckpt_name, args): gb_size = ckpt_size / (1024**3) gb_per_sec = gb_size / write_sec print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' ) print(f'*********************************************') diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 739e31124..faf4fc5d8 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -5,6 +5,7 @@ from torch_save_utils import PINNED_BUFFER_MB +GPT2L = 'gpt2-large' TINY_T5 = 'tiny-t5' PHI3_MINI = 'phi3' PHI3_VISION = 'phi3-v' @@ -12,6 +13,7 @@ HF_MODELS_DICT = { TINY_T5: "hf-internal-testing/tiny-random-t5", + GPT2L: GPT2L, PHI3_MINI: "microsoft/Phi-3.5-mini-instruct", PHI3_VISION: "microsoft/Phi-3.5-vision-instruct", LLAMA3_1B: "meta-llama/Llama-3.2-1B", @@ -58,7 +60,7 @@ def parse_arguments(): default=None, type=str, required=True, - help=f'Hugging Face transformers tag of model. Available models = {list(HF_MODELS_DICT.keys())}') + help=f'HuggingFace tag of model. 
Available models = {list(HF_MODELS_DICT.keys())}') parser.add_argument('--local_rank', type=int, diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index a489ee927..6c1103049 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -4,16 +4,18 @@ import os from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save from save_model_utils import get_model, validate_arguments, parse_arguments +import deepspeed +from deepspeed.accelerator import get_accelerator def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') fn_dict = { - # 'test_save': test_save, - # 'test_ds_mock_save': test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - # 'test_ds_aio_fast_save': test_ds_aio_fast_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, } for tag, fn in fn_dict.items(): if tag == 'test_ds_gds_fast_save' and not args.gpu: @@ -28,14 +30,13 @@ def run(model, model_name, ckpt_name, args): gb_size = ckpt_size / (1024**3) gb_per_sec = gb_size / write_sec print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' ) print(f'*********************************************') def _get_initialized_optimizer(model, fused_opt): base_optimizer = Adam(model.parameters()) - import deepspeed if fused_opt: from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper else: @@ -62,7 +63,7 @@ def main(): if args.half: model = model.half() if args.gpu: - model = model.cuda() + model = model.to(get_accelerator().current_device_name()) if args.optimizer: optimizer = 
_get_initialized_optimizer(model, args.fused) ckpt_state = {'model': model, 'optimizer': optimizer} diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index accbcd8b2..014fdd035 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -5,19 +5,20 @@ from torch_save_utils import PINNED_BUFFER_MB from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save import deepspeed +from deepspeed.accelerator import get_accelerator def run(args): - device = torch.cuda.current_device() if args.gpu else 'cpu' + device = get_accelerator().current_device_name() if args.gpu else 'cpu' buffer = torch.randint(high=128, size=(args.mb_size * (1024**2), ), dtype=torch.uint8, device=device) fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, - 'test_ds_py_save': test_ds_py_save, + # 'test_save': test_save, + # 'test_ds_mock_save': test_ds_mock_save, + # 'test_ds_py_save': test_ds_py_save, 'test_ds_aio_fast_save': test_ds_aio_fast_save, 'test_ds_gds_fast_save': test_ds_gds_fast_save } @@ -33,7 +34,7 @@ def run(args): gb_per_sec = args.mb_size / (1024.0 * write_sec) gb_size = os.path.getsize(file) / (1024**3) print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' ) print(f'*********************************************') diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index 166ce4582..cf5f2bba5 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -15,7 +15,7 @@ def _get_aio_handle(): - h = AsyncIOBuilder().load().aio_handle(block_size=AIO_BLOCK_SIZE, + h = AsyncIOBuilder().load(verbose=False).aio_handle(block_size=AIO_BLOCK_SIZE, queue_depth=AIO_QUEUE_DEPTH, 
single_submit=AIO_SINGLE_SUBMIT, overlap_events=AIO_SINGLE_SUBMIT, @@ -23,7 +23,7 @@ def _get_aio_handle(): return h def _get_gds_handle(): - h = GDSBuilder().load().gds_handle(block_size=AIO_BLOCK_SIZE, + h = GDSBuilder().load(verbose=False).gds_handle(block_size=AIO_BLOCK_SIZE, queue_depth=AIO_QUEUE_DEPTH, single_submit=AIO_SINGLE_SUBMIT, overlap_events=AIO_SINGLE_SUBMIT, @@ -74,7 +74,7 @@ def _get_gds_components(args): h = _get_gds_handle() pinned_memory = torch.empty(args.io_buffer_mb * (1024**2), dtype=torch.uint8, - device=get_accelerator().device_name()) + device=get_accelerator().current_device_name()) h.pin_device_tensor(pinned_memory) return h, pinned_memory @@ -86,7 +86,7 @@ def _test_ds_fast_save(file, buffer, args, use_gds): else: h, pinned_memory = _get_aio_components(args) st = time.time() - fast_writer_config = FastFileWriterConfig(aio_handle=h, + fast_writer_config = FastFileWriterConfig(dnvme_handle=h, pinned_tensor=pinned_memory, double_buffer=not args.single_io_buffer, num_parallel_writers=1, From 515dded20cac9147407e9a368b1c179be1598c93 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 25 Feb 2025 13:07:42 -0500 Subject: [PATCH 29/40] Rebase fast persist (#184) * Fast model checkpointing * Support both legacy and serialized formats * Add io_buffer_mb option * Bug fix * Force flush * More model options; Refactor common codes * --gpu option * --half and more flexible options * Add deepspeed.save_checkpoint() * Free ds memory * Improve repro * Double I/O buffer (#56) * Double I/O buffer (#60) * Add checkpoint comparison (#62) * Add checkpoint comparison * Corrected a typo Co-authored-by: Yang Li * save_checkpoint perf monitoring * Disable checkpoint save on exit * Perf statistics for save_checkpoint (#64) * save_checkpoint perf monitoring * Disable checkpoint save on exit * add logs for a100-80 * add torch* error log with half flag but without fused flag * log for error * local rank arg * Handle local_rank arg (#78) * save_checkpoint perf 
monitoring * Disable checkpoint save on exit * local rank arg * Single writer option * Single writer option (#79) * save_checkpoint perf monitoring * Disable checkpoint save on exit * local rank arg * Single writer option * Allow missing folder * DP writer refactor * Update for DS; Add GDS Signed-off-by: Olatunji Ruwase * Integrate GDS into deepspeed_model_save --------- Signed-off-by: Olatunji Ruwase Co-authored-by: jerryyangli Co-authored-by: Yang Li Co-authored-by: GuanhuaWang --- .../model_checkpoint/checkpoint_compare.py | 123 +++ .../model_checkpoint/deepspeed_save_model.py | 139 ++++ .../log_9_21_22/gpt2-unfused.txt | 599 ++++++++++++++ .../log_9_21_22/gpt2_fused_z2.txt | 781 ++++++++++++++++++ .../log_9_21_22/torch_star_half_error.txt | 72 ++ fast_io/model_checkpoint/requirements.txt | 1 + fast_io/model_checkpoint/save_model_utils.py | 116 +++ fast_io/model_checkpoint/torch_save_model.py | 76 ++ fast_io/model_checkpoint/torch_save_tensor.py | 95 +++ fast_io/model_checkpoint/torch_save_utils.py | 111 +++ 10 files changed, 2113 insertions(+) create mode 100644 fast_io/model_checkpoint/checkpoint_compare.py create mode 100644 fast_io/model_checkpoint/deepspeed_save_model.py create mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt create mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt create mode 100644 fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt create mode 100644 fast_io/model_checkpoint/requirements.txt create mode 100644 fast_io/model_checkpoint/save_model_utils.py create mode 100644 fast_io/model_checkpoint/torch_save_model.py create mode 100644 fast_io/model_checkpoint/torch_save_tensor.py create mode 100644 fast_io/model_checkpoint/torch_save_utils.py diff --git a/fast_io/model_checkpoint/checkpoint_compare.py b/fast_io/model_checkpoint/checkpoint_compare.py new file mode 100644 index 000000000..cc67b61d9 --- /dev/null +++ b/fast_io/model_checkpoint/checkpoint_compare.py @@ -0,0 +1,123 @@ +#This 
#This script is for testing whether two checkpoints match; it prints all the differences

import torch
import os
import sys
import pickle
from collections import OrderedDict

# Key paths (components joined with '/') whose mismatches are expected by
# design (e.g. the checkpoint writer config) and are therefore not reported.
exclude_key_str = {'ds_config/checkpoint/writer'}

def main():
    """Compare every checkpoint file common to the two directories given as
    argv[1] and argv[2], printing all differences found."""
    dir1 = sys.argv[1]
    dir2 = sys.argv[2]
    print ("Begin comparison")
    print ("The first directory {}" .format(dir1))
    print ("The second directory {}" .format(dir2))
    print (' ')

    file_list1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))]
    file_list2 = [f for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))]
    common_files = []

    # Report files present in only one directory; collect the intersection.
    for f in file_list1:
        if not (f in file_list2):
            log_error_file_mismatch_first(f)
        else:
            common_files.append(f)
    for f in file_list2:
        if not (f in file_list1):
            log_error_file_mismatch_second(f)

    for f in common_files:
        full_dir1 = os.path.join(dir1, f)
        full_dir2 = os.path.join(dir2, f)
        print ("Begin comparison")
        print("The first checkpoint {}" .format(full_dir1))
        print("The second checkpoint {}" .format(full_dir2))
        print(' ')
        model_first = torch.load(full_dir1)
        model_second = torch.load(full_dir2)
        object_compare(model_first, model_second, [])


def object_compare(model_first, model_second, key_chain):
    """Recursively compare two loaded checkpoint objects, printing every
    difference.

    key_chain is the list of dict keys leading to the current pair; it is
    used only for reporting and is restored (via pop) before returning.
    """
    # Differing types are reported as a value mismatch without recursing.
    if not (type(model_first) == type(model_second)):
        log_error_value_mismatch(model_first, model_second, key_chain)
        return

    if type(model_first) is list:
        if len(model_first) != len(model_second):
            log_error_value_mismatch(model_first, model_second, key_chain)
            return
        for i in range(len(model_first)):
            object_compare(model_first[i], model_second[i], key_chain)
        return

    if type(model_first) is dict or type(model_first) is OrderedDict:
        common_keys = []
        for key in model_first:
            if key not in model_second:
                key_chain.append(key)
                log_error_key_mismatch_first(model_first[key], key_chain)
                key_chain.pop()
            else:
                common_keys.append(key)

        for key in model_second:
            if key not in model_first:
                key_chain.append(key)
                log_error_key_mismatch_second(model_second[key], key_chain)
                key_chain.pop()

        for key in common_keys:
            key_chain.append(key)
            object_compare(model_first[key], model_second[key], key_chain)
            key_chain.pop()
        return

    # Leaf comparison: prefer attribute-dict equality when available,
    # otherwise the values themselves. Tensor/ndarray comparison yields an
    # elementwise result, so collapse non-bool results with .all().
    if hasattr(model_first, '__dict__'):
        equality = (model_first.__dict__ == model_second.__dict__)
    else:
        equality = (model_first == model_second)
    if type(equality) is not bool:
        equality = (equality.all())
    if not equality:
        log_error_value_mismatch(model_first, model_second, key_chain)
    return


def log_error_file_mismatch_first(filename):
    """Report a file present only in the first directory."""
    print("The following file appeared in the first but not the second directory: {}" .format(filename))
    print(' ')


def log_error_file_mismatch_second(filename):
    """Report a file present only in the second directory."""
    # Bug fix: this reports a missing *file*, not a missing key — the
    # original message said "key", inconsistent with its *_first sibling.
    print("The following file appeared in the second but not the first directory: {}" .format(filename))
    print(" ")


def log_error_key_mismatch_first(model, key_chain):
    """Report a key present only in the first model, unless excluded."""
    key_str = "/".join(key_chain)
    if not (key_str in exclude_key_str):
        print("The following key appeared in the first but not the second model: {}" .format(key_str))
        print("The value of the first model is: {}" .format(model))
        print(" ")


def log_error_key_mismatch_second(model, key_chain):
    """Report a key present only in the second model, unless excluded."""
    key_str = "/".join(key_chain)
    if not (key_str in exclude_key_str):
        print("The following key appeared in the second but not the first model: {}" .format(key_str))
        print("The value of the second model is: {}" .format(model))
        print(" ")


def log_error_value_mismatch(model_first, model_second, key_chain):
    """Report differing values at the position described by key_chain."""
    print ("The values of the following key do not match: {}" .format("/".join(key_chain)))
    print ("The value of the first model is: {}" .format(model_first))
    print ("The value of the second model is: {}" .format(model_second))
    print(" ")

if __name__ == "__main__":
    main()
import time
import torch
import os
import shutil
import gc
import random
import numpy as np
import deepspeed
from deepspeed.accelerator import get_accelerator
from save_model_utils import get_model, validate_arguments, parse_arguments

def _get_ds_config(args, writer_type, use_gds):
    """Build the DeepSpeed config dict for one checkpoint-save benchmark run.

    writer_type: None for baseline torch serialization, or one of the
    deepspeed.io writer types ('mock', 'python', 'fast'). use_gds toggles
    GPUDirect Storage in the aio section (meaningful for the 'fast' writer).
    """
    ds_config = {
        "train_micro_batch_size_per_gpu": 1,
        "zero_optimization": {
            "stage": args.zero_stage,
            "cpu_offload": args.cpu_offload
        },
        "fp16": {
            "enabled": args.half
        },
        "optimizer": {
            "type": "Adam",
            "params": {
                "torch_adam": not args.fused
            }
        },
        "checkpoint": {
            # legacy mode disables the fast checkpoint serialization path
            "checkpoint_serialization": not args.legacy
        },
        "aio": {
            "block_size": 8 * (1024**2),
            "queue_depth": 8,
            "single_submit": False,
            "overlap_events": True,
            "intra_op_parallelism": 2,
            "use_gds": use_gds,
        }
    }

    if writer_type:
        ds_config["checkpoint"]["writer"] = {
            "type": writer_type,
            "io_buffer_size": args.io_buffer_mb * (1024**2),
            "io_buffer_double": not args.single_io_buffer,
            "show_statistics": not args.no_statistics,
            # NOTE(review): hard-coded to socket-level data parallelism;
            # alternatives previously considered were None and
            # `not args.single_writer` — confirm before generalizing.
            "data_parallel": "socket"
        }

    return ds_config


def _get_ds_engine(model, ds_config):
    """Wrap the model in a DeepSpeed engine built from ds_config."""
    ds_engine, _, _, _ = deepspeed.initialize(
        model=model, model_parameters=model.parameters(), config=ds_config)

    return ds_engine


def _do_optimizer_step(ds_engine):
    """Take one optimizer step with zero gradients so optimizer state
    exists (and is included) in the checkpoint."""
    for p in ds_engine.module.parameters():
        p.grad = torch.zeros_like(p)
    ds_engine.step()


def _free_ds_memory(ds_engine):
    """Drop all references held by the engine and reclaim host/device
    memory between benchmark runs."""
    ds_engine.optimizer.optimizer = None
    ds_engine.optimizer = None
    ds_engine.module = None
    ds_engine = None
    del ds_engine
    gc.collect()
    get_accelerator().empty_cache()


def test_save(tag, folder, model, args, writer_type):
    """Save a checkpoint of model into folder using the given writer and
    return the wall-clock save time in seconds."""
    # GDS is only exercised by the 'fast' writer when the tag requests it.
    use_gds = writer_type == 'fast' and 'gds' in tag
    ds_config = _get_ds_config(args, writer_type, use_gds)
    ds_engine = _get_ds_engine(model, ds_config)
    if args.zero_stage == 0:
        _do_optimizer_step(ds_engine)

    st = time.time()
    ds_engine.save_checkpoint(save_dir=folder, tag=tag)
    write_sec = time.time() - st
    _free_ds_memory(ds_engine)
    return write_sec


def _get_folder_size(folder):
    """Return the total size in bytes of all files under folder."""
    size = 0
    for path, _, files in os.walk(folder):
        size += sum(os.path.getsize(os.path.join(path, f)) for f in files)
    return size


def run(model, model_name, ckpt_name, args):
    """Benchmark each checkpoint writer on model, printing size and
    throughput for every run."""
    print(f'Model name = {model_name}')
    writer_dict = {
        'test_save': None,
        'test_ds_mock_save': 'mock',
        'test_ds_py_save': 'python',
        'test_ds_aio_fast_save': 'fast',
        'test_ds_gds_fast_save': 'fast',
    }
    for tag, writer_type in writer_dict.items():
        folder = os.path.join(args.folder, ckpt_name, tag)
        # Start from a clean slate so sizes reflect only this run.
        if os.path.exists(folder):
            shutil.rmtree(folder, ignore_errors=True)
        write_sec = test_save(tag, folder, model, args, writer_type)
        ckpt_size = _get_folder_size(folder)
        gb_size = ckpt_size / (1024**3)
        gb_per_sec = gb_size / write_sec
        print(
            f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s'
        )
        print(f'*********************************************')


def main():
    """Entry point: seed RNGs for reproducibility, parse and validate
    arguments, then run the writer benchmarks."""
    print(
        f'Performance test of deepspeed integration of fast model checkpointing.'
    )
    print(f'torch version = {torch.__version__}')
    torch.manual_seed(42)
    np.random.seed(0)
    random.seed(0)
    args = parse_arguments()
    if not validate_arguments(args):
        # Replaces quit(): the site-injected quit() is meant for the REPL;
        # returning ends the script identically here.
        return

    model, model_name, ckpt_name = get_model(args.model)
    run(model, model_name, ckpt_name, args)


if __name__ == "__main__":
    main()
+torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) +Model name = gpt2-large +[2022-09-21 18:42:17,245] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:17,246] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... +[2022-09-21 18:42:18,108] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-21 18:42:18,109] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl +[2022-09-21 18:42:21,535] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +NCCL version 2.10.3+cuda11.3 +[2022-09-21 18:42:21,770] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:21,772] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:21,772] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:21,772] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:22,127] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 
18:42:22,127] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_tol ............... 
0.01 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] prescale_gradients ........... 
False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] use_quantizer_kernel ......... 
False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:22,130] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module utils... +Time to load utils op: 0.3399326801300049 seconds +[2022-09-21 18:42:23,204] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt +test_save -- 10.13 GB, 6.83 secs, 1.48 gb/s +********************************************* +[2022-09-21 18:42:30,157] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:30,164] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:30,277] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:30,278] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:30,278] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:30,278] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:30,656] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:30,656] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": 
false +} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_offset .............. 
1000 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:30,659] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "mock", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0004949569702148438 seconds +[2022-09-21 18:42:30,786] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +test_ds_mock_save -- 0.00 GB, 0.93 secs, 0.00 gb/s +********************************************* +[2022-09-21 18:42:32,824] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:32,831] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:32,926] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:32,927] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:32,927] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:32,927] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:33,248] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:33,248] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] elasticity_enabled ........... 
False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_groups .............. 
1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "python", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.000392913818359375 seconds +[2022-09-21 18:42:33,377] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523621, 'write_secs': 5.274229288101196} +test_ds_py_save -- 10.13 GB, 6.32 secs, 1.60 gb/s +********************************************* +[2022-09-21 18:42:39,940] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:39,946] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:40,048] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:40,049] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:40,049] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:40,049] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:40,439] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:40,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... +Building extension module async_io... +Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module async_io... +Time to load async_io op: 0.4869067668914795 seconds +[2022-09-21 18:42:41,329] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 
0 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_job_name ......... 
DeepSpeedJobName +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_optimization_stage ...... 
0 +[2022-09-21 18:42:41,332] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "fast", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0004849433898925781 seconds +[2022-09-21 18:42:41,458] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0003745555877685547 seconds +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 1.8456230163574219, 'aio_write_secs': 0.9408478736877441, 'aio_bytes': 10874523136, 'aio_gbs': 10.76442766994695, 'slow_bytes': 483, 'slow_write_secs': 0.0002315044403076172, 'fill_buffer_count': 4519, 'fill_buffer_secs': 0.9024286270141602, 'fill_buffer_speed': 11.22270347101499, 'save_storage': 0, 'save_storage_bytes': 0} +test_ds_fast_save -- 10.13 GB, 3.00 secs, 3.38 gb/s +********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt new file mode 100644 index 000000000..9871b634e --- /dev/null +++ b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt @@ -0,0 +1,781 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=True, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=2) +Model name = gpt2-large +[2022-09-21 18:45:23,129] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:45:23,130] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
+[2022-09-21 18:45:23,991] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-21 18:45:23,991] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl +[2022-09-21 18:45:27,189] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +NCCL version 2.10.3+cuda11.3 +[2022-09-21 18:45:27,478] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Creating extension directory /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam... +Detected CUDA files, patching ldflags +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam/build.ninja... +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options 
'-fPIC' -lineinfo -O3 --use_fast_math -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o +[2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o +[3/3] c++ fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_adam.so +Loading extension module fused_adam... 
+Time to load fused_adam op: 19.252447843551636 seconds +[2022-09-21 18:45:47,263] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:45:47,263] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:45:47,263] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:45:47,263] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:45:47,263] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:45:47,263] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module utils... 
+Time to load utils op: 0.3341379165649414 seconds +[2022-09-21 18:45:47,651] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:45:47,652] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB +[2022-09-21 18:45:47,652] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.58 GB, percent = 1.3% +[2022-09-21 18:45:47,945] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:45:47,946] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB +[2022-09-21 18:45:47,946] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% +[2022-09-21 18:45:48,634] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:45:48,635] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,635] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% +[2022-09-21 18:45:48,681] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:45:48,682] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,682] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.53 GB, percent = 1.3% +[2022-09-21 18:45:48,733] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:45:48,734] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,734] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.4 GB, percent = 1.3% +[2022-09-21 18:45:48,796] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:45:48,797] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 7.38 GB 
Max_CA 7 GB +[2022-09-21 18:45:48,797] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,848] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:45:48,849] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 7.38 GB Max_CA 7 GB +[2022-09-21 18:45:48,849] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,920] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:45:48,921] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 16.04 GB Max_CA 16 GB +[2022-09-21 18:45:48,921] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,921] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:45:48,968] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:45:48,969] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 16.04 GB Max_CA 16 GB +[2022-09-21 18:45:48,969] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:45:48,969] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:45:48,969] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_offset .............. 
1000 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:45:48,972] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0004029273986816406 seconds +[2022-09-21 18:45:49,143] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt +[2022-09-21 18:45:56,478] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_save/zero_to_fp32.py +[2022-09-21 18:45:56,479] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_save -- 10.13 GB, 7.51 secs, 1.35 gb/s +********************************************* +[2022-09-21 18:45:56,603] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:45:56,610] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:45:56,709] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.0011363029479980469 seconds +[2022-09-21 18:45:56,771] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:45:56,771] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:45:56,771] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:45:56,771] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:45:56,771] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:45:56,771] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.00023317337036132812 seconds +[2022-09-21 18:45:56,823] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:45:56,824] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 16 GB +[2022-09-21 18:45:56,824] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.55 GB, percent = 1.3% +[2022-09-21 18:45:57,123] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:45:57,124] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,124] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.54 GB, percent = 1.3% +[2022-09-21 18:45:57,614] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:45:57,615] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,616] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.51 GB, percent = 1.3% 
+[2022-09-21 18:45:57,661] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:45:57,662] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,662] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% +[2022-09-21 18:45:57,713] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:45:57,714] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,714] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,775] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:45:57,775] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:45:57,776] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:57,827] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:45:57,828] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:45:57,828] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,887] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:45:57,887] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:45:57,888] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.38 GB, percent = 1.3% +[2022-09-21 18:45:57,888] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:45:57,933] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:45:57,934] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:45:57,934] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,934] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:45:57,935] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:45:57,935] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... 
None +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... 
False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_gradients_enabled ..... 
False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_optimization_stage ...... 
2 +[2022-09-21 18:45:57,937] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "mock", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.000377655029296875 seconds +[2022-09-21 18:45:57,942] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +[2022-09-21 18:45:59,953] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/zero_to_fp32.py +[2022-09-21 18:45:59,953] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_mock_save -- 0.00 GB, 2.02 secs, 0.00 gb/s +********************************************* +[2022-09-21 18:46:00,921] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 
18:46:00,928] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:46:01,026] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... +Time to load fused_adam op: 0.001192331314086914 seconds +[2022-09-21 18:46:01,079] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:46:01,079] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:46:01,079] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:46:01,079] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:46:01,080] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:46:01,080] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0002560615539550781 seconds +[2022-09-21 18:46:01,130] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:46:01,131] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB +[2022-09-21 18:46:01,132] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.63 GB, percent = 1.3% +[2022-09-21 18:46:01,426] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:46:01,427] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,427] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,861] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:46:01,862] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,863] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,907] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:46:01,908] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,908] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,959] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:46:01,960] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,960] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,013] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:46:02,013] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB 
Max_CA 6 GB +[2022-09-21 18:46:02,014] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,065] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:46:02,066] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:02,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,125] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:46:02,126] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:02,126] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,126] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:46:02,172] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:46:02,173] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:02,173] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:46:02,174] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:46:02,174] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] elasticity_enabled ........... 
False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_groups .............. 
1 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:46:02,176] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "python", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0003757476806640625 seconds +[2022-09-21 18:46:02,181] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909547, 'write_secs': 0.7758586406707764} +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390323, 'write_secs': 4.455736398696899} +[2022-09-21 18:46:09,408] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/zero_to_fp32.py +[2022-09-21 18:46:09,409] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_py_save -- 10.13 GB, 7.23 secs, 1.40 gb/s +********************************************* +[2022-09-21 18:46:09,498] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:46:09,504] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:46:09,602] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.0010247230529785156 seconds +[2022-09-21 18:46:09,666] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:46:09,666] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:46:09,666] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:46:09,666] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:46:09,666] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:46:09,666] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0002410411834716797 seconds +[2022-09-21 18:46:09,746] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:46:09,747] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB +[2022-09-21 18:46:09,747] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.6 GB, percent = 1.3% +[2022-09-21 18:46:10,065] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:46:10,066] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:10,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.59 GB, percent = 1.3% +[2022-09-21 18:46:11,872] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:46:11,873] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,873] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% 
+[2022-09-21 18:46:11,918] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:46:11,919] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,919] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% +[2022-09-21 18:46:11,969] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:46:11,970] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,971] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,030] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:46:12,030] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:12,031] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,081] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:46:12,082] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:12,082] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,141] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:46:12,142] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:12,142] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,142] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:46:12,188] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:46:12,188] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:12,189] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:46:12,189] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... +Building extension module async_io... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module async_io... +Time to load async_io op: 0.5492517948150635 seconds +[2022-09-21 18:46:13,140] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] checkpoint_config ............ 
{'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_clipping ............ 
0.0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_type ................ 
0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_config .................. 
{ + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:46:13,143] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "fast", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.00046539306640625 seconds +[2022-09-21 18:46:13,149] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... 
+No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0002307891845703125 seconds +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0.4641237258911133, 'aio_write_secs': 0.17467093467712402, 'aio_bytes': 1585909248, 'aio_gbs': 8.455860654115417, 'slow_bytes': 297, 'slow_write_secs': 0.00024700164794921875, 'fill_buffer_count': 153, 'fill_buffer_secs': 0.3299696445465088, 'fill_buffer_speed': 4.476148362022062, 'save_storage': 0, 'save_storage_bytes': 0} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0003643035888671875 seconds +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 1.366792917251587, 'aio_write_secs': 0.8517467975616455, 'aio_bytes': 9288390144, 'aio_gbs': 10.156172524167351, 'slow_bytes': 177, 'slow_write_secs': 0.0003936290740966797, 'fill_buffer_count': 25, 'fill_buffer_secs': 0.5708425045013428, 'fill_buffer_speed': 15.153895084423882, 'save_storage': 0, 'save_storage_bytes': 0} +[2022-09-21 18:46:17,080] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/zero_to_fp32.py +[2022-09-21 18:46:17,080] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_fast_save -- 10.13 GB, 3.94 secs, 2.57 gb/s +********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt new file mode 100644 index 000000000..5a5292f6e --- /dev/null +++ 
b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt @@ -0,0 +1,72 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) +Model name = gpt2-large +[2022-09-22 01:29:33,721] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 +[2022-09-22 01:29:33,725] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... +-------------------------------------------------------------------------- +WARNING: No preset parameters were found for the device that Open MPI +detected: + + Local host: azwuse57c00009D + Device name: mlx5_ib0 + Device vendor ID: 0x02c9 + Device vendor part ID: 4124 + +Default device parameters will be used, which may result in lower +performance. You can edit any of the files specified by the +btl_openib_device_param_files MCA parameter to set values for your +device. + +NOTE: You can turn off this warning by setting the MCA parameter + btl_openib_warn_no_device_params_found to 0. +-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +By default, for Open MPI 4.0 and later, infiniband ports on a device +are not used by default. The intent is to use UCX for these devices. +You can override this policy by setting the btl_openib_allow_ib MCA parameter +to true. 
+ + Local host: azwuse57c00009D + Local adapter: mlx5_ib0 + Local port: 1 + +-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +WARNING: There was an error initializing an OpenFabrics device. + + Local host: azwuse57c00009D + Local device: mlx5_ib4 +-------------------------------------------------------------------------- +[2022-09-22 01:29:34,587] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-22 01:29:34,587] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2022-09-22 01:29:34,591] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead +NCCL version 2.10.3+cuda11.3 +[2022-09-22 01:29:38,429] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2022-09-22 01:29:38,430] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-22 01:29:38,461] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +Traceback (most recent call last): + File "deepspeed_save_model.py", line 133, in + main() + File "deepspeed_save_model.py", line 129, in main + run(model, model_name, ckpt_name, args) + File "deepspeed_save_model.py", line 106, in run + write_sec = test_save(tag, folder, model, args, writer_type) + File "deepspeed_save_model.py", line 76, in test_save + ds_engine = _get_ds_engine(model, ds_config) + File "deepspeed_save_model.py", line 52, in _get_ds_engine + ds_engine, _, _, _ = deepspeed.initialize( + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/__init__.py", line 124, in initialize + engine = DeepSpeedEngine(args=args, + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 
322, in __init__ + self._configure_optimizer(optimizer, model_parameters) + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1178, in _configure_optimizer + self.optimizer = self._configure_fp16_optimizer(basic_optimizer) + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1314, in _configure_fp16_optimizer + or self.fp16_fused_mode() \ + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 792, in fp16_fused_mode + return self._config.fp16_fused_mode +AttributeError: 'DeepSpeedConfig' object has no attribute 'fp16_fused_mode' +[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / no device params found +[azwuse57c00009D:37114] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages +[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / ib port not selected diff --git a/fast_io/model_checkpoint/requirements.txt b/fast_io/model_checkpoint/requirements.txt new file mode 100644 index 000000000..976a2b1f3 --- /dev/null +++ b/fast_io/model_checkpoint/requirements.txt @@ -0,0 +1 @@ +transformers diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py new file mode 100644 index 000000000..faf4fc5d8 --- /dev/null +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -0,0 +1,116 @@ +import argparse +import os +from transformers import AutoModelForCausalLM +from transformers import T5ForConditionalGeneration +from torch_save_utils import PINNED_BUFFER_MB + + +GPT2L = 'gpt2-large' +TINY_T5 = 'tiny-t5' +PHI3_MINI = 'phi3' +PHI3_VISION = 'phi3-v' +LLAMA3_1B = 'llama3-1B' + +HF_MODELS_DICT = { + TINY_T5: "hf-internal-testing/tiny-random-t5", + GPT2L: GPT2L, + PHI3_MINI: "microsoft/Phi-3.5-mini-instruct", + PHI3_VISION: "microsoft/Phi-3.5-vision-instruct", + LLAMA3_1B: "meta-llama/Llama-3.2-1B", +} + +def _get_hf_model(tag): + model_name = 
HF_MODELS_DICT[tag] + if tag == TINY_T5: + model = T5ForConditionalGeneration.from_pretrained(model_name) + else: + model = AutoModelForCausalLM.from_pretrained(model_name) + + return model, model_name, tag + +def get_model(model_tag): + return _get_hf_model(model_tag) + + +def validate_arguments(args): + success = True + + if not args.model in HF_MODELS_DICT: + print(f'{args.model} is not a supported HF model tag') + success = False + + if args.optimizer and args.half: + if not args.gpu: + print(f'mixed precision only supported with gpu tensors') + success = False + + return success + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + + parser.add_argument( + '--model', + default=None, + type=str, + required=True, + help=f'HuggingFace tag of model. Available models = {list(HF_MODELS_DICT.keys())}') + + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) + + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + + parser.add_argument('--optimizer', + action='store_true', + help='Include optimizer state in checkpoint.') + + parser.add_argument('--fused', + action='store_true', + help='Use fused fp16 optimizer.') + + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + + parser.add_argument('--half', + action='store_true', + help='Use half-precision tensors.') + + parser.add_argument( + '--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + help=f'Size of pinned i/o buffer in MB. Default = {PINNED_BUFFER_MB}') + + parser.add_argument('--zero_stage', + type=int, + default=0, + help='ZeRO optimization stage. 
Default = 0') + + parser.add_argument('--cpu_offload', + action='store_true', + help='Enable CPU offload of optimizer state.') + + parser.add_argument('--no-statistics', + action='store_true', + help='Suppress low-level performance statistics.') + + parser.add_argument('--single_io_buffer', + action='store_true', + help='Disable double buffering of i/o buffer.') + + + #parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') + + args = parser.parse_args() + print(f'args = {args}') + return args diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py new file mode 100644 index 000000000..6c1103049 --- /dev/null +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -0,0 +1,76 @@ +import time +import torch +from torch.optim import Adam +import os +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save +from save_model_utils import get_model, validate_arguments, parse_arguments +import deepspeed +from deepspeed.accelerator import get_accelerator + + +def run(model, model_name, ckpt_name, args): + print(f'Model name = {model_name}') + fn_dict = { + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, + } + for tag, fn in fn_dict.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue + file = os.path.join(args.folder, f'{tag}_{ckpt_name}.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = fn(file, model, args) + ckpt_size = os.path.getsize(file) + gb_size = ckpt_size / (1024**3) + gb_per_sec = gb_size / write_sec + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' + ) + 
print(f'*********************************************') + + +def _get_initialized_optimizer(model, fused_opt): + base_optimizer = Adam(model.parameters()) + if fused_opt: + from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper + else: + from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper + optimizer = FP16_Wrapper(base_optimizer) + for p in model.parameters(): + p.grad = torch.zeros_like(p) + optimizer.step() + return optimizer + + +def main(): + print( + f'Performance test of torch.save() integration of fast model checkpointing.' + ) + print(f'torch version = {torch.__version__}') + torch.manual_seed(42) + + args = parse_arguments() + if not validate_arguments(args): + quit() + + model, model_name, ckpt_name = get_model(args.model) + if args.half: + model = model.half() + if args.gpu: + model = model.to(get_accelerator().current_device_name()) + if args.optimizer: + optimizer = _get_initialized_optimizer(model, args.fused) + ckpt_state = {'model': model, 'optimizer': optimizer} + else: + ckpt_state = {'model': model} + run(ckpt_state, model_name, ckpt_name, args) + + +if __name__ == "__main__": + main() diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py new file mode 100644 index 000000000..014fdd035 --- /dev/null +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -0,0 +1,95 @@ +import time +import argparse +import torch +import os +from torch_save_utils import PINNED_BUFFER_MB +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save +import deepspeed +from deepspeed.accelerator import get_accelerator + + +def run(args): + device = get_accelerator().current_device_name() if args.gpu else 'cpu' + buffer = torch.randint(high=128, + size=(args.mb_size * (1024**2), ), + dtype=torch.uint8, + device=device) + + fn_dict = { + # 'test_save': test_save, + # 'test_ds_mock_save': 
test_ds_mock_save, + # 'test_ds_py_save': test_ds_py_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save + } + for tag, fn in fn_dict.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue + file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = fn(file, buffer, args) + gb_per_sec = args.mb_size / (1024.0 * write_sec) + gb_size = os.path.getsize(file) / (1024**3) + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' + ) + print(f'*********************************************') + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + parser.add_argument('--mb_size', + type=int, + default=None, + required=True, + help='Size of tensor to save in MB.') + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + + parser.add_argument('--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + help='Size of pinned i/o buffer in MB.') + + parser.add_argument('--no-statistics', + action='store_true', + help='Suppress low-level performance statistics.') + + parser.add_argument('--single_io_buffer', + action='store_true', + help='Disable double buffering of i/o buffer.') + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def main(): + print( + f'Performance test of torch.save() integration of fast tensor checkpointing.' 
+ ) + args = parse_arguments() + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + quit() + deepspeed.init_distributed() + run(args) + + +if __name__ == "__main__": + main() diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py new file mode 100644 index 000000000..cf5f2bba5 --- /dev/null +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -0,0 +1,111 @@ +import time +import torch +import os +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder, GDSBuilder +from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter, FastFileWriterConfig +from deepspeed.accelerator import get_accelerator + +AIO_QUEUE_DEPTH = 8 +AIO_BLOCK_SIZE = 8 * (1024**2) +AIO_INTRA_OP_PARALLEL = 1 +AIO_SINGLE_SUBMIT = False +AIO_OVERLAP_EVENTS = False +PINNED_BUFFER_MB = 64 + + +def _get_aio_handle(): + h = AsyncIOBuilder().load(verbose=False).aio_handle(block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + intra_op_parallelism=AIO_INTRA_OP_PARALLEL) + return h + +def _get_gds_handle(): + h = GDSBuilder().load(verbose=False).gds_handle(block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + intra_op_parallelism=AIO_INTRA_OP_PARALLEL) + return h + +def test_save(file, buffer, args): + st = time.time() + torch.save(f=file, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) + return time.time() - st + + +def test_ds_mock_save(file, buffer, args): + st = time.time() + ds_mock_writer = MockFileWriter(file) + torch.save(f=ds_mock_writer, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) + ds_mock_writer.close() # Force flush to storage + write_sec = time.time() - st + if not args.no_statistics: + ds_mock_writer._dump_state() + return write_sec + + +def test_ds_py_save(file, buffer, args): + st = time.time() + 
ds_py_writer = PyFileWriter(file) + torch.save(f=ds_py_writer, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) + ds_py_writer.close() # Force flush to storage + write_sec = time.time() - st + if not args.no_statistics: + ds_py_writer._dump_state() + return write_sec + +def _get_aio_components(args): + h = _get_aio_handle() + pinned_memory = torch.zeros(args.io_buffer_mb * (1024**2), + dtype=torch.uint8, + device='cpu').pin_memory() + return h, pinned_memory + +def _get_gds_components(args): + h = _get_gds_handle() + pinned_memory = torch.empty(args.io_buffer_mb * (1024**2), + dtype=torch.uint8, + device=get_accelerator().current_device_name()) + h.pin_device_tensor(pinned_memory) + return h, pinned_memory + + + +def _test_ds_fast_save(file, buffer, args, use_gds): + if use_gds: + h, pinned_memory = _get_gds_components(args) + else: + h, pinned_memory = _get_aio_components(args) + st = time.time() + fast_writer_config = FastFileWriterConfig(dnvme_handle=h, + pinned_tensor=pinned_memory, + double_buffer=not args.single_io_buffer, + num_parallel_writers=1, + writer_rank=0) + + ds_fast_writer = FastFileWriter(file_path=file, + config=fast_writer_config) + torch.save(f=ds_fast_writer, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) + ds_fast_writer.close() # Force flush to storage + write_sec = time.time() - st + if not args.no_statistics: + ds_fast_writer._dump_state() + return write_sec + + +def test_ds_aio_fast_save(file, buffer, args): + return _test_ds_fast_save(file, buffer, args, False) + +def test_ds_gds_fast_save(file, buffer, args): + return _test_ds_fast_save(file, buffer, args, True) From d01aa278c7676f16b17c2ba2ee4a1a46b431f3a3 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 26 Mar 2025 09:36:09 -0400 Subject: [PATCH 30/40] Move folder Signed-off-by: Olatunji Ruwase --- deepnvme/file_access/aio_load_cpu_tensor.py | 11 +- deepnvme/file_access/aio_store_cpu_tensor.py | 10 +- .../model_checkpoint/checkpoint_compare.py 
| 0 .../model_checkpoint/deepspeed_save_model.py | 0 deepnvme/model_checkpoint/local_cufile.json | 1 + .../model_checkpoint/requirements.txt | 0 .../model_checkpoint/save_model_utils.py | 0 .../model_checkpoint/torch_save_model.py | 0 .../model_checkpoint/torch_save_tensor.py | 52 +- .../model_checkpoint/torch_save_utils.py | 0 .../log_9_21_22/gpt2-unfused.txt | 599 -------------- .../log_9_21_22/gpt2_fused_z2.txt | 781 ------------------ .../log_9_21_22/torch_star_half_error.txt | 72 -- 13 files changed, 68 insertions(+), 1458 deletions(-) rename {fast_io => deepnvme}/model_checkpoint/checkpoint_compare.py (100%) rename {fast_io => deepnvme}/model_checkpoint/deepspeed_save_model.py (100%) create mode 100644 deepnvme/model_checkpoint/local_cufile.json rename {fast_io => deepnvme}/model_checkpoint/requirements.txt (100%) rename {fast_io => deepnvme}/model_checkpoint/save_model_utils.py (100%) rename {fast_io => deepnvme}/model_checkpoint/torch_save_model.py (100%) rename {fast_io => deepnvme}/model_checkpoint/torch_save_tensor.py (65%) rename {fast_io => deepnvme}/model_checkpoint/torch_save_utils.py (100%) delete mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt delete mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt delete mode 100644 fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt diff --git a/deepnvme/file_access/aio_load_cpu_tensor.py b/deepnvme/file_access/aio_load_cpu_tensor.py index 27a1e61c5..d6f767231 100644 --- a/deepnvme/file_access/aio_load_cpu_tensor.py +++ b/deepnvme/file_access/aio_load_cpu_tensor.py @@ -2,6 +2,7 @@ import os, timeit, functools from deepspeed.ops.op_builder import AsyncIOBuilder from utils import parse_read_arguments, GIGA_UNIT +from deepspeed.accelerator import get_accelerator def file_read(inp_f, handle, bounce_buffer): handle.sync_pread(bounce_buffer, inp_f) @@ -14,7 +15,12 @@ def main(): cnt = args.loop aio_handle = AsyncIOBuilder().load().aio_handle() - bounce_buffer = 
torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() + native_locked_tensor = get_accelerator()._name == 'cpu' + + if native_locked_tensor: + bounce_buffer = aio_handle.new_cpu_locked_tensor(file_sz, torch.Tensor().to(torch.uint8)) + else: + bounce_buffer = torch.empty(file_sz, dtype=torch.uint8).pin_memory() t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) aio_t = t.timeit(cnt) @@ -27,5 +33,8 @@ def main(): py_tensor = py_file_read(input_file) print(f'Validation success = {aio_tensor.equal(py_tensor)}') + if native_locked_tensor: + aio_handle.free_cpu_locked_tensor(bounce_buffer) + if __name__ == "__main__": main() diff --git a/deepnvme/file_access/aio_store_cpu_tensor.py b/deepnvme/file_access/aio_store_cpu_tensor.py index 20c03792b..5cdd6f68b 100644 --- a/deepnvme/file_access/aio_store_cpu_tensor.py +++ b/deepnvme/file_access/aio_store_cpu_tensor.py @@ -2,6 +2,7 @@ import os, timeit, functools, pathlib from deepspeed.ops.op_builder import AsyncIOBuilder from utils import parse_write_arguments, GIGA_UNIT +from deepspeed.accelerator import get_accelerator def file_write(out_f, tensor, handle, bounce_buffer): bounce_buffer.copy_(tensor) @@ -14,9 +15,13 @@ def main(): pathlib.Path(output_file).unlink(missing_ok=True) file_sz = args.mb_size*(1024**2) app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) + native_locked_tensor = get_accelerator()._name == 'cpu' aio_handle = AsyncIOBuilder().load().aio_handle() - bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() + if native_locked_tensor: + bounce_buffer = aio_handle.new_cpu_locked_tensor(file_sz, torch.Tensor().to(torch.uint8)) + else: + bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, aio_handle, bounce_buffer)) @@ -33,6 +38,9 @@ def main(): filecmp.clear_cache() 
print(f'Validation success = {filecmp.cmp(py_ref_file, output_file, shallow=False) }') + if native_locked_tensor: + aio_handle.free_cpu_locked_tensor(bounce_buffer) + pathlib.Path(output_file).unlink(missing_ok=True) diff --git a/fast_io/model_checkpoint/checkpoint_compare.py b/deepnvme/model_checkpoint/checkpoint_compare.py similarity index 100% rename from fast_io/model_checkpoint/checkpoint_compare.py rename to deepnvme/model_checkpoint/checkpoint_compare.py diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/deepnvme/model_checkpoint/deepspeed_save_model.py similarity index 100% rename from fast_io/model_checkpoint/deepspeed_save_model.py rename to deepnvme/model_checkpoint/deepspeed_save_model.py diff --git a/deepnvme/model_checkpoint/local_cufile.json b/deepnvme/model_checkpoint/local_cufile.json new file mode 100644 index 000000000..7d4d9c8e3 --- /dev/null +++ b/deepnvme/model_checkpoint/local_cufile.json @@ -0,0 +1 @@ +{"execution": {"max_io_queue_depth": 8, "max_request_parallelism": 1, "max_io_threads": 1, "parallel_io": true, "min_io_threshold_size_kb": 8192}} \ No newline at end of file diff --git a/fast_io/model_checkpoint/requirements.txt b/deepnvme/model_checkpoint/requirements.txt similarity index 100% rename from fast_io/model_checkpoint/requirements.txt rename to deepnvme/model_checkpoint/requirements.txt diff --git a/fast_io/model_checkpoint/save_model_utils.py b/deepnvme/model_checkpoint/save_model_utils.py similarity index 100% rename from fast_io/model_checkpoint/save_model_utils.py rename to deepnvme/model_checkpoint/save_model_utils.py diff --git a/fast_io/model_checkpoint/torch_save_model.py b/deepnvme/model_checkpoint/torch_save_model.py similarity index 100% rename from fast_io/model_checkpoint/torch_save_model.py rename to deepnvme/model_checkpoint/torch_save_model.py diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/deepnvme/model_checkpoint/torch_save_tensor.py similarity index 65% rename from 
fast_io/model_checkpoint/torch_save_tensor.py rename to deepnvme/model_checkpoint/torch_save_tensor.py index 014fdd035..381deebde 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/deepnvme/model_checkpoint/torch_save_tensor.py @@ -6,9 +6,51 @@ from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save import deepspeed from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist +import multiprocessing as mp +import os +FUNC_DICT = { + # 'test_save': test_save, + # 'test_ds_mock_save': test_ds_mock_save, + # 'test_ds_py_save': test_ds_py_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save, + # 'test_ds_aio_fast_save': test_ds_aio_fast_save, +} def run(args): + + for tag, fn in FUNC_DICT.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue + print(f"launching {tag=} from {os.getpid()=}") + mp.set_start_method('spawn', force=True) + run_save_method(tag, args) + + +def run_save_method(tag, args): + print(f"running {tag=} from {os.getpid()=}") + device = get_accelerator().current_device_name() if args.gpu else 'cpu' + buffer = torch.randint(high=128, + size=(args.mb_size * (1024**2), ), + dtype=torch.uint8, + device=device) + + file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = FUNC_DICT[tag](file, buffer, args) + gb_per_sec = args.mb_size / (1024.0 * write_sec) + gb_size = os.path.getsize(file) / (1024**3) + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' + ) + print(f'*********************************************') + + +def old_run(args): device = get_accelerator().current_device_name() if args.gpu else 'cpu' buffer = torch.randint(high=128, size=(args.mb_size * (1024**2), ), @@ -16,10 +58,10 @@ def run(args): device=device) fn_dict = { - # 'test_save': test_save, - # 
'test_ds_mock_save': test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - 'test_ds_aio_fast_save': test_ds_aio_fast_save, + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + # 'test_ds_aio_fast_save': test_ds_aio_fast_save, 'test_ds_gds_fast_save': test_ds_gds_fast_save } for tag, fn in fn_dict.items(): @@ -39,6 +81,7 @@ def run(args): print(f'*********************************************') + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--folder', @@ -89,6 +132,7 @@ def main(): quit() deepspeed.init_distributed() run(args) + dist.destroy_process_group() if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/deepnvme/model_checkpoint/torch_save_utils.py similarity index 100% rename from fast_io/model_checkpoint/torch_save_utils.py rename to deepnvme/model_checkpoint/torch_save_utils.py diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt deleted file mode 100644 index 33985e8db..000000000 --- a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt +++ /dev/null @@ -1,599 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) -Model name = gpt2-large -[2022-09-21 18:42:17,245] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:17,246] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
-[2022-09-21 18:42:18,108] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-21 18:42:18,109] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl -[2022-09-21 18:42:21,535] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -NCCL version 2.10.3+cuda11.3 -[2022-09-21 18:42:21,770] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:21,772] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:21,772] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:21,772] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:22,127] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:22,127] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_enabled ................. 
True -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_rounding ............ 
0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_config .................. 
{ - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:22,130] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... 
-Time to load utils op: 0.3399326801300049 seconds -[2022-09-21 18:42:23,204] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt -test_save -- 10.13 GB, 6.83 secs, 1.48 gb/s -********************************************* -[2022-09-21 18:42:30,157] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:30,164] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:30,277] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:30,278] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:30,278] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:30,278] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:30,656] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:30,656] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_offset .............. 
1000 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:30,659] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "mock", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0004949569702148438 seconds -[2022-09-21 18:42:30,786] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -test_ds_mock_save -- 0.00 GB, 0.93 secs, 0.00 gb/s -********************************************* -[2022-09-21 18:42:32,824] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:32,831] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:32,926] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:32,927] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:32,927] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:32,927] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:33,248] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:33,248] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] elasticity_enabled ........... 
False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_groups .............. 
1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "python", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.000392913818359375 seconds -[2022-09-21 18:42:33,377] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523621, 'write_secs': 5.274229288101196} -test_ds_py_save -- 10.13 GB, 6.32 secs, 1.60 gb/s -********************************************* -[2022-09-21 18:42:39,940] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:39,946] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:40,048] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:40,049] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:40,049] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:40,049] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:40,439] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:40,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... -Building extension module async_io... -Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module async_io... -Time to load async_io op: 0.4869067668914795 seconds -[2022-09-21 18:42:41,329] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 
0 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_job_name ......... 
DeepSpeedJobName -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_optimization_stage ...... 
0 -[2022-09-21 18:42:41,332] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "fast", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0004849433898925781 seconds -[2022-09-21 18:42:41,458] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0003745555877685547 seconds -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 1.8456230163574219, 'aio_write_secs': 0.9408478736877441, 'aio_bytes': 10874523136, 'aio_gbs': 10.76442766994695, 'slow_bytes': 483, 'slow_write_secs': 0.0002315044403076172, 'fill_buffer_count': 4519, 'fill_buffer_secs': 0.9024286270141602, 'fill_buffer_speed': 11.22270347101499, 'save_storage': 0, 'save_storage_bytes': 0} -test_ds_fast_save -- 10.13 GB, 3.00 secs, 3.38 gb/s -********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt deleted file mode 100644 index 9871b634e..000000000 --- a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt +++ /dev/null @@ -1,781 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=True, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=2) -Model name = gpt2-large -[2022-09-21 18:45:23,129] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:45:23,130] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
-[2022-09-21 18:45:23,991] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-21 18:45:23,991] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl -[2022-09-21 18:45:27,189] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -NCCL version 2.10.3+cuda11.3 -[2022-09-21 18:45:27,478] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Creating extension directory /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam... -Detected CUDA files, patching ldflags -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam/build.ninja... -Building extension module fused_adam... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options 
'-fPIC' -lineinfo -O3 --use_fast_math -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o -[2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o -[3/3] c++ fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_adam.so -Loading extension module fused_adam... 
-Time to load fused_adam op: 19.252447843551636 seconds -[2022-09-21 18:45:47,263] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:45:47,263] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:45:47,263] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:45:47,263] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:45:47,263] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:45:47,263] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... 
-Time to load utils op: 0.3341379165649414 seconds -[2022-09-21 18:45:47,651] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:45:47,652] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB -[2022-09-21 18:45:47,652] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.58 GB, percent = 1.3% -[2022-09-21 18:45:47,945] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:45:47,946] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB -[2022-09-21 18:45:47,946] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% -[2022-09-21 18:45:48,634] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:45:48,635] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,635] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% -[2022-09-21 18:45:48,681] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:45:48,682] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,682] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.53 GB, percent = 1.3% -[2022-09-21 18:45:48,733] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:45:48,734] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,734] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.4 GB, percent = 1.3% -[2022-09-21 18:45:48,796] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:45:48,797] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 7.38 GB 
Max_CA 7 GB -[2022-09-21 18:45:48,797] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,848] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:45:48,849] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 7.38 GB Max_CA 7 GB -[2022-09-21 18:45:48,849] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,920] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:45:48,921] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 16.04 GB Max_CA 16 GB -[2022-09-21 18:45:48,921] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,921] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:45:48,968] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:45:48,969] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 16.04 GB Max_CA 16 GB -[2022-09-21 18:45:48,969] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:45:48,969] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:45:48,969] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_offset .............. 
1000 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:45:48,972] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0004029273986816406 seconds -[2022-09-21 18:45:49,143] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt -[2022-09-21 18:45:56,478] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_save/zero_to_fp32.py -[2022-09-21 18:45:56,479] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_save -- 10.13 GB, 7.51 secs, 1.35 gb/s -********************************************* -[2022-09-21 18:45:56,603] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:45:56,610] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:45:56,709] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... 
-Time to load fused_adam op: 0.0011363029479980469 seconds -[2022-09-21 18:45:56,771] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:45:56,771] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:45:56,771] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:45:56,771] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:45:56,771] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:45:56,771] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00023317337036132812 seconds -[2022-09-21 18:45:56,823] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:45:56,824] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 16 GB -[2022-09-21 18:45:56,824] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.55 GB, percent = 1.3% -[2022-09-21 18:45:57,123] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:45:57,124] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,124] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.54 GB, percent = 1.3% -[2022-09-21 18:45:57,614] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:45:57,615] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,616] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.51 GB, percent = 1.3% 
-[2022-09-21 18:45:57,661] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:45:57,662] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,662] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% -[2022-09-21 18:45:57,713] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:45:57,714] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,714] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,775] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:45:57,775] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:45:57,776] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:57,827] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:45:57,828] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:45:57,828] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,887] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:45:57,887] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:45:57,888] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.38 GB, percent = 1.3% -[2022-09-21 18:45:57,888] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:45:57,933] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:45:57,934] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:45:57,934] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,934] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:45:57,935] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:45:57,935] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... 
None -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... 
False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_gradients_enabled ..... 
False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_optimization_stage ...... 
2 -[2022-09-21 18:45:57,937] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "mock", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.000377655029296875 seconds -[2022-09-21 18:45:57,942] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -[2022-09-21 18:45:59,953] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/zero_to_fp32.py -[2022-09-21 18:45:59,953] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_mock_save -- 0.00 GB, 2.02 secs, 0.00 gb/s -********************************************* -[2022-09-21 18:46:00,921] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 
18:46:00,928] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:46:01,026] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... -Time to load fused_adam op: 0.001192331314086914 seconds -[2022-09-21 18:46:01,079] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:46:01,079] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:46:01,079] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:46:01,079] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:46:01,080] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:46:01,080] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0002560615539550781 seconds -[2022-09-21 18:46:01,130] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:46:01,131] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB -[2022-09-21 18:46:01,132] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.63 GB, percent = 1.3% -[2022-09-21 18:46:01,426] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:46:01,427] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,427] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,861] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:46:01,862] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,863] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,907] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:46:01,908] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,908] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,959] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:46:01,960] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,960] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,013] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:46:02,013] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB 
Max_CA 6 GB -[2022-09-21 18:46:02,014] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,065] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:46:02,066] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:02,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,125] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:46:02,126] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:02,126] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,126] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:46:02,172] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:46:02,173] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:02,173] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:46:02,174] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:46:02,174] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] elasticity_enabled ........... 
False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_groups .............. 
1 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:46:02,176] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "python", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0003757476806640625 seconds -[2022-09-21 18:46:02,181] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909547, 'write_secs': 0.7758586406707764} -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390323, 'write_secs': 4.455736398696899} -[2022-09-21 18:46:09,408] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/zero_to_fp32.py -[2022-09-21 18:46:09,409] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_py_save -- 10.13 GB, 7.23 secs, 1.40 gb/s -********************************************* -[2022-09-21 18:46:09,498] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:46:09,504] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:46:09,602] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... 
-Time to load fused_adam op: 0.0010247230529785156 seconds -[2022-09-21 18:46:09,666] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:46:09,666] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:46:09,666] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:46:09,666] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:46:09,666] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:46:09,666] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0002410411834716797 seconds -[2022-09-21 18:46:09,746] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:46:09,747] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB -[2022-09-21 18:46:09,747] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.6 GB, percent = 1.3% -[2022-09-21 18:46:10,065] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:46:10,066] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:10,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.59 GB, percent = 1.3% -[2022-09-21 18:46:11,872] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:46:11,873] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,873] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% 
-[2022-09-21 18:46:11,918] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:46:11,919] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,919] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% -[2022-09-21 18:46:11,969] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:46:11,970] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,971] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,030] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:46:12,030] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:12,031] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,081] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:46:12,082] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:12,082] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,141] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:46:12,142] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:12,142] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,142] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:46:12,188] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:46:12,188] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:12,189] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:46:12,189] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... -Building extension module async_io... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module async_io... -Time to load async_io op: 0.5492517948150635 seconds -[2022-09-21 18:46:13,140] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] checkpoint_config ............ 
{'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_clipping ............ 
0.0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_type ................ 
0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_config .................. 
{ - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:46:13,143] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "fast", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00046539306640625 seconds -[2022-09-21 18:46:13,149] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... 
-No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0002307891845703125 seconds -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0.4641237258911133, 'aio_write_secs': 0.17467093467712402, 'aio_bytes': 1585909248, 'aio_gbs': 8.455860654115417, 'slow_bytes': 297, 'slow_write_secs': 0.00024700164794921875, 'fill_buffer_count': 153, 'fill_buffer_secs': 0.3299696445465088, 'fill_buffer_speed': 4.476148362022062, 'save_storage': 0, 'save_storage_bytes': 0} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0003643035888671875 seconds -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 1.366792917251587, 'aio_write_secs': 0.8517467975616455, 'aio_bytes': 9288390144, 'aio_gbs': 10.156172524167351, 'slow_bytes': 177, 'slow_write_secs': 0.0003936290740966797, 'fill_buffer_count': 25, 'fill_buffer_secs': 0.5708425045013428, 'fill_buffer_speed': 15.153895084423882, 'save_storage': 0, 'save_storage_bytes': 0} -[2022-09-21 18:46:17,080] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/zero_to_fp32.py -[2022-09-21 18:46:17,080] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_fast_save -- 10.13 GB, 3.94 secs, 2.57 gb/s -********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt deleted file mode 100644 index 5a5292f6e..000000000 --- 
a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt +++ /dev/null @@ -1,72 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) -Model name = gpt2-large -[2022-09-22 01:29:33,721] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 -[2022-09-22 01:29:33,725] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... --------------------------------------------------------------------------- -WARNING: No preset parameters were found for the device that Open MPI -detected: - - Local host: azwuse57c00009D - Device name: mlx5_ib0 - Device vendor ID: 0x02c9 - Device vendor part ID: 4124 - -Default device parameters will be used, which may result in lower -performance. You can edit any of the files specified by the -btl_openib_device_param_files MCA parameter to set values for your -device. - -NOTE: You can turn off this warning by setting the MCA parameter - btl_openib_warn_no_device_params_found to 0. --------------------------------------------------------------------------- --------------------------------------------------------------------------- -By default, for Open MPI 4.0 and later, infiniband ports on a device -are not used by default. The intent is to use UCX for these devices. -You can override this policy by setting the btl_openib_allow_ib MCA parameter -to true. 
- - Local host: azwuse57c00009D - Local adapter: mlx5_ib0 - Local port: 1 - --------------------------------------------------------------------------- --------------------------------------------------------------------------- -WARNING: There was an error initializing an OpenFabrics device. - - Local host: azwuse57c00009D - Local device: mlx5_ib4 --------------------------------------------------------------------------- -[2022-09-22 01:29:34,587] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-22 01:29:34,587] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl -[2022-09-22 01:29:34,591] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead -NCCL version 2.10.3+cuda11.3 -[2022-09-22 01:29:38,429] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False -[2022-09-22 01:29:38,430] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-22 01:29:38,461] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} -Traceback (most recent call last): - File "deepspeed_save_model.py", line 133, in - main() - File "deepspeed_save_model.py", line 129, in main - run(model, model_name, ckpt_name, args) - File "deepspeed_save_model.py", line 106, in run - write_sec = test_save(tag, folder, model, args, writer_type) - File "deepspeed_save_model.py", line 76, in test_save - ds_engine = _get_ds_engine(model, ds_config) - File "deepspeed_save_model.py", line 52, in _get_ds_engine - ds_engine, _, _, _ = deepspeed.initialize( - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/__init__.py", line 124, in initialize - engine = DeepSpeedEngine(args=args, - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 
322, in __init__ - self._configure_optimizer(optimizer, model_parameters) - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1178, in _configure_optimizer - self.optimizer = self._configure_fp16_optimizer(basic_optimizer) - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1314, in _configure_fp16_optimizer - or self.fp16_fused_mode() \ - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 792, in fp16_fused_mode - return self._config.fp16_fused_mode -AttributeError: 'DeepSpeedConfig' object has no attribute 'fp16_fused_mode' -[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / no device params found -[azwuse57c00009D:37114] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages -[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / ib port not selected From 4059f805893e73450c0c41f9debe82cc15fd5df5 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 26 Mar 2025 09:46:11 -0400 Subject: [PATCH 31/40] Remove folder Signed-off-by: Olatunji Ruwase --- .../model_checkpoint/checkpoint_compare.py | 123 --- .../model_checkpoint/deepspeed_save_model.py | 139 ---- .../log_9_21_22/gpt2-unfused.txt | 599 -------------- .../log_9_21_22/gpt2_fused_z2.txt | 781 ------------------ .../log_9_21_22/torch_star_half_error.txt | 72 -- fast_io/model_checkpoint/requirements.txt | 1 - fast_io/model_checkpoint/save_model_utils.py | 116 --- fast_io/model_checkpoint/torch_save_model.py | 76 -- fast_io/model_checkpoint/torch_save_tensor.py | 95 --- fast_io/model_checkpoint/torch_save_utils.py | 111 --- 10 files changed, 2113 deletions(-) delete mode 100644 fast_io/model_checkpoint/checkpoint_compare.py delete mode 100644 fast_io/model_checkpoint/deepspeed_save_model.py delete mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt delete mode 100644 
fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt delete mode 100644 fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt delete mode 100644 fast_io/model_checkpoint/requirements.txt delete mode 100644 fast_io/model_checkpoint/save_model_utils.py delete mode 100644 fast_io/model_checkpoint/torch_save_model.py delete mode 100644 fast_io/model_checkpoint/torch_save_tensor.py delete mode 100644 fast_io/model_checkpoint/torch_save_utils.py diff --git a/fast_io/model_checkpoint/checkpoint_compare.py b/fast_io/model_checkpoint/checkpoint_compare.py deleted file mode 100644 index cc67b61d9..000000000 --- a/fast_io/model_checkpoint/checkpoint_compare.py +++ /dev/null @@ -1,123 +0,0 @@ -#This script is for testing whether two checkpoints match; it prints all the differences - -import torch -import os -import sys -import pickle -from collections import OrderedDict - -exclude_key_str = {'ds_config/checkpoint/writer'} - -def main(): - dir1 = sys.argv[1] - dir2 = sys.argv[2] - print ("Begin comparison") - print ("The first directory {}" .format(dir1)) - print ("The second directory {}" .format(dir2)) - print (' ') - - file_list1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))] - file_list2 = [f for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))] - common_files = [] - - for f in file_list1: - if not (f in file_list2): - log_error_file_mismatch_first(f) - else: - common_files.append(f) - for f in file_list2: - if not (f in file_list1): - log_error_file_mismatch_second(f) - - for f in common_files: - full_dir1 = os.path.join(dir1, f) - full_dir2 = os.path.join(dir2, f) - print ("Begin comparison") - print("The first checkpoint {}" .format(full_dir1)) - print("The second checkpoint {}" .format(full_dir2)) - print(' ') - model_first = torch.load(full_dir1) - model_second = torch.load(full_dir2) - object_compare(model_first, model_second, []) - - -def object_compare(model_first, model_second, key_chain): - if not 
(type(model_first) == type(model_second)): - log_error_value_mismatch(model_first, model_second, key_chain) - return - - if type(model_first) is list: - if len(model_first) != len(model_second): - log_error_value_mismatch(model_first, model_second, key_chain) - return - for i in range(len(model_first)): - object_compare(model_first[i], model_second[i], key_chain) - return - - if type(model_first) is dict or type(model_first) is OrderedDict: - common_keys = [] - for key in model_first: - if key not in model_second: - key_chain.append(key) - log_error_key_mismatch_first(model_first[key], key_chain) - key_chain.pop() - else: - common_keys.append(key) - - for key in model_second: - if key not in model_first: - key_chain.append(key) - log_error_key_mismatch_second(model_second[key], key_chain) - key_chain.pop() - - for key in common_keys: - key_chain.append(key) - object_compare(model_first[key], model_second[key], key_chain) - key_chain.pop() - return - - if hasattr(model_first, '__dict__'): - equality = (model_first.__dict__ == model_second.__dict__) - else: - equality = (model_first == model_second) - if type(equality) is not bool: - equality = (equality.all()) - if not equality: - log_error_value_mismatch(model_first, model_second, key_chain) - return - - -def log_error_file_mismatch_first(filename): - print("The following file appeared in the first but not the second directory: {}" .format(filename)) - print(' ') - - -def log_error_file_mismatch_second(filename): - print("The following key appeared in the second but not the first directory: {}" .format(filename)) - print(" ") - - -def log_error_key_mismatch_first(model, key_chain): - key_str = "/".join(key_chain) - if not (key_str in exclude_key_str): - print("The following key appeared in the first but not the second model: {}" .format(key_str)) - print("The value of the first model is: {}" .format(model)) - print(" ") - - -def log_error_key_mismatch_second(model, key_chain): - key_str = "/".join(key_chain) - if 
not (key_str in exclude_key_str): - print("The following key appeared in the second but not the first model: {}" .format(key_str)) - print("The value of the second model is: {}" .format(model)) - print(" ") - - -def log_error_value_mismatch(model_first, model_second, key_chain): - print ("The values of the following key do not match: {}" .format("/".join(key_chain))) - print ("The value of the first model is: {}" .format(model_first)) - print ("The value of the second model is: {}" .format(model_second)) - print(" ") - -if __name__ == "__main__": - main() diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py deleted file mode 100644 index ea97dd717..000000000 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ /dev/null @@ -1,139 +0,0 @@ -import time -import torch -import os -import shutil -import gc -import random -import numpy as np -import deepspeed -from deepspeed.accelerator import get_accelerator -from save_model_utils import get_model, validate_arguments, parse_arguments - -def _get_ds_config(args, writer_type, use_gds): - ds_config = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": args.zero_stage, - "cpu_offload": args.cpu_offload - }, - "fp16": { - "enabled": args.half - }, - "optimizer": { - "type": "Adam", - "params": { - "torch_adam": not args.fused - } - }, - "checkpoint": { - "checkpoint_serialization": not args.legacy - }, - "aio": { - "block_size": 8 * (1024**2), - "queue_depth": 8, - "single_submit": False, - "overlap_events": True, - "intra_op_parallelism": 2, - "use_gds": use_gds, - } - } - - if writer_type: - ds_config["checkpoint"]["writer"] = { - "type": writer_type, - "io_buffer_size": args.io_buffer_mb * (1024**2), - "io_buffer_double": not args.single_io_buffer, - "show_statistics": not args.no_statistics, - "data_parallel": "socket" # None # not args.single_writer - } - - return ds_config - - -def _get_ds_engine(model, ds_config): - ds_engine, _, 
_, _ = deepspeed.initialize( - model=model, model_parameters=model.parameters(), config=ds_config) - - return ds_engine - - -def _do_optimizer_step(ds_engine): - for p in ds_engine.module.parameters(): - p.grad = torch.zeros_like(p) - ds_engine.step() - - -def _free_ds_memory(ds_engine): - ds_engine.optimizer.optimizer = None - ds_engine.optimizer = None - ds_engine.module = None - ds_engine = None - del ds_engine - gc.collect() - get_accelerator().empty_cache() - - -def test_save(tag, folder, model, args, writer_type): - use_gds = writer_type == 'fast' and 'gds' in tag - ds_config = _get_ds_config(args, writer_type, use_gds) - ds_engine = _get_ds_engine(model, ds_config) - if args.zero_stage == 0: - _do_optimizer_step(ds_engine) - - st = time.time() - ds_engine.save_checkpoint(save_dir=folder, tag=tag) - write_sec = time.time() - st - _free_ds_memory(ds_engine) - return write_sec - - -def _get_folder_size(folder): - size = 0 - for path, _, files in os.walk(folder): - size += sum([os.path.getsize(os.path.join(path, f)) for f in files]) - return size - - -def run(model, model_name, ckpt_name, args): - print(f'Model name = {model_name}') - writer_dict = { - 'test_save': None, - 'test_ds_mock_save': 'mock', - 'test_ds_py_save': 'python', - 'test_ds_aio_fast_save': 'fast', - 'test_ds_gds_fast_save': 'fast', - } - for tag, writer_type in writer_dict.items(): - folder = os.path.join(args.folder, ckpt_name, tag) - if os.path.exists(folder): - shutil.rmtree(folder, ignore_errors=True) - # if not os.path.exists(folder): - # os.makedirs(folder, exist_ok=True) - write_sec = test_save(tag, folder, model, args, writer_type) - ckpt_size = _get_folder_size(folder) - gb_size = ckpt_size / (1024**3) - gb_per_sec = gb_size / write_sec - print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' - ) - print(f'*********************************************') - - -def main(): - print( - f'Performance test of deepspeed integration of fast model checkpointing.' 
- ) - print(f'torch version = {torch.__version__}') - torch.manual_seed(42) - np.random.seed(0) - random.seed(0) - args = parse_arguments() - if not validate_arguments(args): - quit() - - model, model_name, ckpt_name = get_model(args.model) - run(model, model_name, ckpt_name, args) - - -if __name__ == "__main__": - main() diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt deleted file mode 100644 index 33985e8db..000000000 --- a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt +++ /dev/null @@ -1,599 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) -Model name = gpt2-large -[2022-09-21 18:42:17,245] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:17,246] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
-[2022-09-21 18:42:18,108] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-21 18:42:18,109] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl -[2022-09-21 18:42:21,535] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -NCCL version 2.10.3+cuda11.3 -[2022-09-21 18:42:21,770] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:21,772] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:21,772] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:21,772] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:22,127] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:22,127] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_enabled ................. 
True -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_rounding ............ 
0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_config .................. 
{ - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:22,130] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... 
-Time to load utils op: 0.3399326801300049 seconds -[2022-09-21 18:42:23,204] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt -test_save -- 10.13 GB, 6.83 secs, 1.48 gb/s -********************************************* -[2022-09-21 18:42:30,157] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:30,164] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:30,277] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:30,278] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:30,278] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:30,278] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:30,656] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:30,656] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_offset .............. 
1000 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:30,659] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "mock", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0004949569702148438 seconds -[2022-09-21 18:42:30,786] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -test_ds_mock_save -- 0.00 GB, 0.93 secs, 0.00 gb/s -********************************************* -[2022-09-21 18:42:32,824] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:32,831] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:32,926] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:32,927] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:32,927] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:32,927] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:33,248] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:33,248] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] elasticity_enabled ........... 
False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_groups .............. 
1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "python", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.000392913818359375 seconds -[2022-09-21 18:42:33,377] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523621, 'write_secs': 5.274229288101196} -test_ds_py_save -- 10.13 GB, 6.32 secs, 1.60 gb/s -********************************************* -[2022-09-21 18:42:39,940] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:39,946] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:40,048] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:40,049] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:40,049] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:40,049] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:40,439] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:40,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... -Building extension module async_io... -Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module async_io... -Time to load async_io op: 0.4869067668914795 seconds -[2022-09-21 18:42:41,329] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 
0 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_job_name ......... 
DeepSpeedJobName -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_optimization_stage ...... 
0 -[2022-09-21 18:42:41,332] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "fast", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0004849433898925781 seconds -[2022-09-21 18:42:41,458] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0003745555877685547 seconds -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 1.8456230163574219, 'aio_write_secs': 0.9408478736877441, 'aio_bytes': 10874523136, 'aio_gbs': 10.76442766994695, 'slow_bytes': 483, 'slow_write_secs': 0.0002315044403076172, 'fill_buffer_count': 4519, 'fill_buffer_secs': 0.9024286270141602, 'fill_buffer_speed': 11.22270347101499, 'save_storage': 0, 'save_storage_bytes': 0} -test_ds_fast_save -- 10.13 GB, 3.00 secs, 3.38 gb/s -********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt deleted file mode 100644 index 9871b634e..000000000 --- a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt +++ /dev/null @@ -1,781 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=True, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=2) -Model name = gpt2-large -[2022-09-21 18:45:23,129] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:45:23,130] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
-[2022-09-21 18:45:23,991] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-21 18:45:23,991] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl -[2022-09-21 18:45:27,189] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -NCCL version 2.10.3+cuda11.3 -[2022-09-21 18:45:27,478] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Creating extension directory /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam... -Detected CUDA files, patching ldflags -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam/build.ninja... -Building extension module fused_adam... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options 
'-fPIC' -lineinfo -O3 --use_fast_math -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o -[2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o -[3/3] c++ fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_adam.so -Loading extension module fused_adam... 
-Time to load fused_adam op: 19.252447843551636 seconds -[2022-09-21 18:45:47,263] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:45:47,263] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:45:47,263] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:45:47,263] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:45:47,263] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:45:47,263] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... 
-Time to load utils op: 0.3341379165649414 seconds -[2022-09-21 18:45:47,651] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:45:47,652] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB -[2022-09-21 18:45:47,652] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.58 GB, percent = 1.3% -[2022-09-21 18:45:47,945] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:45:47,946] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB -[2022-09-21 18:45:47,946] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% -[2022-09-21 18:45:48,634] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:45:48,635] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,635] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% -[2022-09-21 18:45:48,681] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:45:48,682] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,682] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.53 GB, percent = 1.3% -[2022-09-21 18:45:48,733] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:45:48,734] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,734] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.4 GB, percent = 1.3% -[2022-09-21 18:45:48,796] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:45:48,797] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 7.38 GB 
Max_CA 7 GB -[2022-09-21 18:45:48,797] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,848] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:45:48,849] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 7.38 GB Max_CA 7 GB -[2022-09-21 18:45:48,849] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,920] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:45:48,921] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 16.04 GB Max_CA 16 GB -[2022-09-21 18:45:48,921] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,921] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:45:48,968] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:45:48,969] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 16.04 GB Max_CA 16 GB -[2022-09-21 18:45:48,969] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:45:48,969] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:45:48,969] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_offset .............. 
1000 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:45:48,972] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0004029273986816406 seconds -[2022-09-21 18:45:49,143] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt -[2022-09-21 18:45:56,478] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_save/zero_to_fp32.py -[2022-09-21 18:45:56,479] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_save -- 10.13 GB, 7.51 secs, 1.35 gb/s -********************************************* -[2022-09-21 18:45:56,603] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:45:56,610] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:45:56,709] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... 
-Time to load fused_adam op: 0.0011363029479980469 seconds -[2022-09-21 18:45:56,771] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:45:56,771] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:45:56,771] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:45:56,771] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:45:56,771] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:45:56,771] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00023317337036132812 seconds -[2022-09-21 18:45:56,823] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:45:56,824] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 16 GB -[2022-09-21 18:45:56,824] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.55 GB, percent = 1.3% -[2022-09-21 18:45:57,123] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:45:57,124] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,124] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.54 GB, percent = 1.3% -[2022-09-21 18:45:57,614] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:45:57,615] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,616] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.51 GB, percent = 1.3% 
-[2022-09-21 18:45:57,661] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:45:57,662] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,662] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% -[2022-09-21 18:45:57,713] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:45:57,714] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,714] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,775] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:45:57,775] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:45:57,776] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:57,827] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:45:57,828] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:45:57,828] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,887] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:45:57,887] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:45:57,888] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.38 GB, percent = 1.3% -[2022-09-21 18:45:57,888] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:45:57,933] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:45:57,934] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:45:57,934] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,934] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:45:57,935] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:45:57,935] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... 
None -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... 
False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_gradients_enabled ..... 
False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_optimization_stage ...... 
2 -[2022-09-21 18:45:57,937] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "mock", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.000377655029296875 seconds -[2022-09-21 18:45:57,942] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -[2022-09-21 18:45:59,953] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/zero_to_fp32.py -[2022-09-21 18:45:59,953] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_mock_save -- 0.00 GB, 2.02 secs, 0.00 gb/s -********************************************* -[2022-09-21 18:46:00,921] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 
18:46:00,928] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:46:01,026] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... -Time to load fused_adam op: 0.001192331314086914 seconds -[2022-09-21 18:46:01,079] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:46:01,079] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:46:01,079] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:46:01,079] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:46:01,080] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:46:01,080] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0002560615539550781 seconds -[2022-09-21 18:46:01,130] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:46:01,131] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB -[2022-09-21 18:46:01,132] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.63 GB, percent = 1.3% -[2022-09-21 18:46:01,426] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:46:01,427] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,427] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,861] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:46:01,862] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,863] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,907] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:46:01,908] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,908] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,959] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:46:01,960] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,960] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,013] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:46:02,013] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB 
Max_CA 6 GB -[2022-09-21 18:46:02,014] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,065] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:46:02,066] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:02,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,125] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:46:02,126] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:02,126] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,126] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:46:02,172] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:46:02,173] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:02,173] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:46:02,174] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:46:02,174] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] elasticity_enabled ........... 
False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_groups .............. 
1 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:46:02,176] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "python", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0003757476806640625 seconds -[2022-09-21 18:46:02,181] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909547, 'write_secs': 0.7758586406707764} -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390323, 'write_secs': 4.455736398696899} -[2022-09-21 18:46:09,408] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/zero_to_fp32.py -[2022-09-21 18:46:09,409] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_py_save -- 10.13 GB, 7.23 secs, 1.40 gb/s -********************************************* -[2022-09-21 18:46:09,498] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:46:09,504] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:46:09,602] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... 
-Time to load fused_adam op: 0.0010247230529785156 seconds -[2022-09-21 18:46:09,666] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:46:09,666] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:46:09,666] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:46:09,666] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:46:09,666] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:46:09,666] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0002410411834716797 seconds -[2022-09-21 18:46:09,746] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:46:09,747] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB -[2022-09-21 18:46:09,747] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.6 GB, percent = 1.3% -[2022-09-21 18:46:10,065] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:46:10,066] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:10,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.59 GB, percent = 1.3% -[2022-09-21 18:46:11,872] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:46:11,873] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,873] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% 
-[2022-09-21 18:46:11,918] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:46:11,919] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,919] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% -[2022-09-21 18:46:11,969] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:46:11,970] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,971] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,030] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:46:12,030] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:12,031] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,081] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:46:12,082] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:12,082] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,141] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:46:12,142] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:12,142] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,142] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:46:12,188] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:46:12,188] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:12,189] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:46:12,189] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... -Building extension module async_io... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module async_io... -Time to load async_io op: 0.5492517948150635 seconds -[2022-09-21 18:46:13,140] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] checkpoint_config ............ 
{'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_clipping ............ 
0.0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_type ................ 
0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_config .................. 
{ - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:46:13,143] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "fast", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00046539306640625 seconds -[2022-09-21 18:46:13,149] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... 
-No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0002307891845703125 seconds -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0.4641237258911133, 'aio_write_secs': 0.17467093467712402, 'aio_bytes': 1585909248, 'aio_gbs': 8.455860654115417, 'slow_bytes': 297, 'slow_write_secs': 0.00024700164794921875, 'fill_buffer_count': 153, 'fill_buffer_secs': 0.3299696445465088, 'fill_buffer_speed': 4.476148362022062, 'save_storage': 0, 'save_storage_bytes': 0} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0003643035888671875 seconds -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 1.366792917251587, 'aio_write_secs': 0.8517467975616455, 'aio_bytes': 9288390144, 'aio_gbs': 10.156172524167351, 'slow_bytes': 177, 'slow_write_secs': 0.0003936290740966797, 'fill_buffer_count': 25, 'fill_buffer_secs': 0.5708425045013428, 'fill_buffer_speed': 15.153895084423882, 'save_storage': 0, 'save_storage_bytes': 0} -[2022-09-21 18:46:17,080] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/zero_to_fp32.py -[2022-09-21 18:46:17,080] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_fast_save -- 10.13 GB, 3.94 secs, 2.57 gb/s -********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt deleted file mode 100644 index 5a5292f6e..000000000 --- 
a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt +++ /dev/null @@ -1,72 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) -Model name = gpt2-large -[2022-09-22 01:29:33,721] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 -[2022-09-22 01:29:33,725] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... --------------------------------------------------------------------------- -WARNING: No preset parameters were found for the device that Open MPI -detected: - - Local host: azwuse57c00009D - Device name: mlx5_ib0 - Device vendor ID: 0x02c9 - Device vendor part ID: 4124 - -Default device parameters will be used, which may result in lower -performance. You can edit any of the files specified by the -btl_openib_device_param_files MCA parameter to set values for your -device. - -NOTE: You can turn off this warning by setting the MCA parameter - btl_openib_warn_no_device_params_found to 0. --------------------------------------------------------------------------- --------------------------------------------------------------------------- -By default, for Open MPI 4.0 and later, infiniband ports on a device -are not used by default. The intent is to use UCX for these devices. -You can override this policy by setting the btl_openib_allow_ib MCA parameter -to true. 
- - Local host: azwuse57c00009D - Local adapter: mlx5_ib0 - Local port: 1 - --------------------------------------------------------------------------- --------------------------------------------------------------------------- -WARNING: There was an error initializing an OpenFabrics device. - - Local host: azwuse57c00009D - Local device: mlx5_ib4 --------------------------------------------------------------------------- -[2022-09-22 01:29:34,587] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-22 01:29:34,587] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl -[2022-09-22 01:29:34,591] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead -NCCL version 2.10.3+cuda11.3 -[2022-09-22 01:29:38,429] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False -[2022-09-22 01:29:38,430] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-22 01:29:38,461] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} -Traceback (most recent call last): - File "deepspeed_save_model.py", line 133, in - main() - File "deepspeed_save_model.py", line 129, in main - run(model, model_name, ckpt_name, args) - File "deepspeed_save_model.py", line 106, in run - write_sec = test_save(tag, folder, model, args, writer_type) - File "deepspeed_save_model.py", line 76, in test_save - ds_engine = _get_ds_engine(model, ds_config) - File "deepspeed_save_model.py", line 52, in _get_ds_engine - ds_engine, _, _, _ = deepspeed.initialize( - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/__init__.py", line 124, in initialize - engine = DeepSpeedEngine(args=args, - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 
322, in __init__ - self._configure_optimizer(optimizer, model_parameters) - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1178, in _configure_optimizer - self.optimizer = self._configure_fp16_optimizer(basic_optimizer) - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1314, in _configure_fp16_optimizer - or self.fp16_fused_mode() \ - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 792, in fp16_fused_mode - return self._config.fp16_fused_mode -AttributeError: 'DeepSpeedConfig' object has no attribute 'fp16_fused_mode' -[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / no device params found -[azwuse57c00009D:37114] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages -[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / ib port not selected diff --git a/fast_io/model_checkpoint/requirements.txt b/fast_io/model_checkpoint/requirements.txt deleted file mode 100644 index 976a2b1f3..000000000 --- a/fast_io/model_checkpoint/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -transformers diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py deleted file mode 100644 index faf4fc5d8..000000000 --- a/fast_io/model_checkpoint/save_model_utils.py +++ /dev/null @@ -1,116 +0,0 @@ -import argparse -import os -from transformers import AutoModelForCausalLM -from transformers import T5ForConditionalGeneration -from torch_save_utils import PINNED_BUFFER_MB - - -GPT2L = 'gpt2-large' -TINY_T5 = 'tiny-t5' -PHI3_MINI = 'phi3' -PHI3_VISION = 'phi3-v' -LLAMA3_1B = 'llama3-1B' - -HF_MODELS_DICT = { - TINY_T5: "hf-internal-testing/tiny-random-t5", - GPT2L: GPT2L, - PHI3_MINI: "microsoft/Phi-3.5-mini-instruct", - PHI3_VISION: "microsoft/Phi-3.5-vision-instruct", - LLAMA3_1B: "meta-llama/Llama-3.2-1B", -} - -def _get_hf_model(tag): - model_name = 
HF_MODELS_DICT[tag] - if tag == TINY_T5: - model = T5ForConditionalGeneration.from_pretrained(model_name) - else: - model = AutoModelForCausalLM.from_pretrained(model_name) - - return model, model_name, tag - -def get_model(model_tag): - return _get_hf_model(model_tag) - - -def validate_arguments(args): - success = True - - if not args.model in HF_MODELS_DICT: - print(f'{args.model} is not a supported HF model tag') - success = False - - if args.optimizer and args.half: - if not args.gpu: - print(f'mixed precision only supported with gpu tensors') - success = False - - return success - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--folder', - default=None, - type=str, - required=True, - help='Folder to use for I/O.') - - parser.add_argument( - '--model', - default=None, - type=str, - required=True, - help=f'HuggingFace tag of model. Available models = {list(HF_MODELS_DICT.keys())}') - - parser.add_argument('--local_rank', - type=int, - default=0, - help='Local rank' ) - - parser.add_argument('--legacy', - action='store_true', - help='Use torch legacy save format') - - parser.add_argument('--optimizer', - action='store_true', - help='Include optimizer state in checkpoint.') - - parser.add_argument('--fused', - action='store_true', - help='Use fused fp16 optimizer.') - - parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') - - parser.add_argument('--half', - action='store_true', - help='Use half-precision tensors.') - - parser.add_argument( - '--io_buffer_mb', - type=int, - default=PINNED_BUFFER_MB, - help=f'Size of pinned i/o buffer in MB. Default = {PINNED_BUFFER_MB}') - - parser.add_argument('--zero_stage', - type=int, - default=0, - help='ZeRO optimization stage. 
Default = 0') - - parser.add_argument('--cpu_offload', - action='store_true', - help='Enable CPU offload of optimizer state.') - - parser.add_argument('--no-statistics', - action='store_true', - help='Suppress low-level performance statistics.') - - parser.add_argument('--single_io_buffer', - action='store_true', - help='Disable double buffering of i/o buffer.') - - - #parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') - - args = parser.parse_args() - print(f'args = {args}') - return args diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py deleted file mode 100644 index 6c1103049..000000000 --- a/fast_io/model_checkpoint/torch_save_model.py +++ /dev/null @@ -1,76 +0,0 @@ -import time -import torch -from torch.optim import Adam -import os -from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save -from save_model_utils import get_model, validate_arguments, parse_arguments -import deepspeed -from deepspeed.accelerator import get_accelerator - - -def run(model, model_name, ckpt_name, args): - print(f'Model name = {model_name}') - fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, - 'test_ds_py_save': test_ds_py_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save, - 'test_ds_aio_fast_save': test_ds_aio_fast_save, - } - for tag, fn in fn_dict.items(): - if tag == 'test_ds_gds_fast_save' and not args.gpu: - continue - file = os.path.join(args.folder, f'{tag}_{ckpt_name}.pt') - print(f'checkpoint file = {file}') - if os.path.isfile(file): - os.remove(file) - st = time.time() - write_sec = fn(file, model, args) - ckpt_size = os.path.getsize(file) - gb_size = ckpt_size / (1024**3) - gb_per_sec = gb_size / write_sec - print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' - ) - 
print(f'*********************************************') - - -def _get_initialized_optimizer(model, fused_opt): - base_optimizer = Adam(model.parameters()) - if fused_opt: - from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper - else: - from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper - optimizer = FP16_Wrapper(base_optimizer) - for p in model.parameters(): - p.grad = torch.zeros_like(p) - optimizer.step() - return optimizer - - -def main(): - print( - f'Performance test of torch.save() integration of fast model checkpointing.' - ) - print(f'torch version = {torch.__version__}') - torch.manual_seed(42) - - args = parse_arguments() - if not validate_arguments(args): - quit() - - model, model_name, ckpt_name = get_model(args.model) - if args.half: - model = model.half() - if args.gpu: - model = model.to(get_accelerator().current_device_name()) - if args.optimizer: - optimizer = _get_initialized_optimizer(model, args.fused) - ckpt_state = {'model': model, 'optimizer': optimizer} - else: - ckpt_state = {'model': model} - run(ckpt_state, model_name, ckpt_name, args) - - -if __name__ == "__main__": - main() diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py deleted file mode 100644 index 014fdd035..000000000 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ /dev/null @@ -1,95 +0,0 @@ -import time -import argparse -import torch -import os -from torch_save_utils import PINNED_BUFFER_MB -from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save -import deepspeed -from deepspeed.accelerator import get_accelerator - - -def run(args): - device = get_accelerator().current_device_name() if args.gpu else 'cpu' - buffer = torch.randint(high=128, - size=(args.mb_size * (1024**2), ), - dtype=torch.uint8, - device=device) - - fn_dict = { - # 'test_save': test_save, - # 'test_ds_mock_save': 
test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - 'test_ds_aio_fast_save': test_ds_aio_fast_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save - } - for tag, fn in fn_dict.items(): - if tag == 'test_ds_gds_fast_save' and not args.gpu: - continue - file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') - print(f'checkpoint file = {file}') - if os.path.isfile(file): - os.remove(file) - st = time.time() - write_sec = fn(file, buffer, args) - gb_per_sec = args.mb_size / (1024.0 * write_sec) - gb_size = os.path.getsize(file) / (1024**3) - print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' - ) - print(f'*********************************************') - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--folder', - default=None, - type=str, - required=True, - help='Folder to use for I/O.') - parser.add_argument('--mb_size', - type=int, - default=None, - required=True, - help='Size of tensor to save in MB.') - parser.add_argument('--legacy', - action='store_true', - help='Use torch legacy save format') - - parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') - - parser.add_argument('--io_buffer_mb', - type=int, - default=PINNED_BUFFER_MB, - help='Size of pinned i/o buffer in MB.') - - parser.add_argument('--no-statistics', - action='store_true', - help='Suppress low-level performance statistics.') - - parser.add_argument('--single_io_buffer', - action='store_true', - help='Disable double buffering of i/o buffer.') - parser.add_argument('--local_rank', - type=int, - default=0, - help='Local rank' ) - - args = parser.parse_args() - print(f'args = {args}') - return args - - -def main(): - print( - f'Performance test of torch.save() integration of fast tensor checkpointing.' 
- ) - args = parse_arguments() - if not os.path.exists(args.folder): - print(f'Invalid folder: {args.folder}') - quit() - deepspeed.init_distributed() - run(args) - - -if __name__ == "__main__": - main() diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py deleted file mode 100644 index cf5f2bba5..000000000 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ /dev/null @@ -1,111 +0,0 @@ -import time -import torch -import os -import deepspeed -from deepspeed.ops.op_builder import AsyncIOBuilder, GDSBuilder -from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter, FastFileWriterConfig -from deepspeed.accelerator import get_accelerator - -AIO_QUEUE_DEPTH = 8 -AIO_BLOCK_SIZE = 8 * (1024**2) -AIO_INTRA_OP_PARALLEL = 1 -AIO_SINGLE_SUBMIT = False -AIO_OVERLAP_EVENTS = False -PINNED_BUFFER_MB = 64 - - -def _get_aio_handle(): - h = AsyncIOBuilder().load(verbose=False).aio_handle(block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - intra_op_parallelism=AIO_INTRA_OP_PARALLEL) - return h - -def _get_gds_handle(): - h = GDSBuilder().load(verbose=False).gds_handle(block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - intra_op_parallelism=AIO_INTRA_OP_PARALLEL) - return h - -def test_save(file, buffer, args): - st = time.time() - torch.save(f=file, - obj=buffer, - _use_new_zipfile_serialization=not args.legacy) - return time.time() - st - - -def test_ds_mock_save(file, buffer, args): - st = time.time() - ds_mock_writer = MockFileWriter(file) - torch.save(f=ds_mock_writer, - obj=buffer, - _use_new_zipfile_serialization=not args.legacy) - ds_mock_writer.close() # Force flush to storage - write_sec = time.time() - st - if not args.no_statistics: - ds_mock_writer._dump_state() - return write_sec - - -def test_ds_py_save(file, buffer, args): - st = time.time() - 
ds_py_writer = PyFileWriter(file) - torch.save(f=ds_py_writer, - obj=buffer, - _use_new_zipfile_serialization=not args.legacy) - ds_py_writer.close() # Force flush to storage - write_sec = time.time() - st - if not args.no_statistics: - ds_py_writer._dump_state() - return write_sec - -def _get_aio_components(args): - h = _get_aio_handle() - pinned_memory = torch.zeros(args.io_buffer_mb * (1024**2), - dtype=torch.uint8, - device='cpu').pin_memory() - return h, pinned_memory - -def _get_gds_components(args): - h = _get_gds_handle() - pinned_memory = torch.empty(args.io_buffer_mb * (1024**2), - dtype=torch.uint8, - device=get_accelerator().current_device_name()) - h.pin_device_tensor(pinned_memory) - return h, pinned_memory - - - -def _test_ds_fast_save(file, buffer, args, use_gds): - if use_gds: - h, pinned_memory = _get_gds_components(args) - else: - h, pinned_memory = _get_aio_components(args) - st = time.time() - fast_writer_config = FastFileWriterConfig(dnvme_handle=h, - pinned_tensor=pinned_memory, - double_buffer=not args.single_io_buffer, - num_parallel_writers=1, - writer_rank=0) - - ds_fast_writer = FastFileWriter(file_path=file, - config=fast_writer_config) - torch.save(f=ds_fast_writer, - obj=buffer, - _use_new_zipfile_serialization=not args.legacy) - ds_fast_writer.close() # Force flush to storage - write_sec = time.time() - st - if not args.no_statistics: - ds_fast_writer._dump_state() - return write_sec - - -def test_ds_aio_fast_save(file, buffer, args): - return _test_ds_fast_save(file, buffer, args, False) - -def test_ds_gds_fast_save(file, buffer, args): - return _test_ds_fast_save(file, buffer, args, True) From 1c3a54c8da1ae3f1abf5f85cf015cf5179b5cd62 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 26 Mar 2025 13:35:41 -0400 Subject: [PATCH 32/40] More cleanup Signed-off-by: Olatunji Ruwase --- .../model_checkpoint/checkpoint_compare.py | 123 ------------------ deepnvme/model_checkpoint/torch_save_model.py | 3 + 
.../model_checkpoint/torch_save_tensor.py | 45 +------ 3 files changed, 5 insertions(+), 166 deletions(-) delete mode 100644 deepnvme/model_checkpoint/checkpoint_compare.py diff --git a/deepnvme/model_checkpoint/checkpoint_compare.py b/deepnvme/model_checkpoint/checkpoint_compare.py deleted file mode 100644 index cc67b61d9..000000000 --- a/deepnvme/model_checkpoint/checkpoint_compare.py +++ /dev/null @@ -1,123 +0,0 @@ -#This script is for testing whether two checkpoints match; it prints all the differences - -import torch -import os -import sys -import pickle -from collections import OrderedDict - -exclude_key_str = {'ds_config/checkpoint/writer'} - -def main(): - dir1 = sys.argv[1] - dir2 = sys.argv[2] - print ("Begin comparison") - print ("The first directory {}" .format(dir1)) - print ("The second directory {}" .format(dir2)) - print (' ') - - file_list1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))] - file_list2 = [f for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))] - common_files = [] - - for f in file_list1: - if not (f in file_list2): - log_error_file_mismatch_first(f) - else: - common_files.append(f) - for f in file_list2: - if not (f in file_list1): - log_error_file_mismatch_second(f) - - for f in common_files: - full_dir1 = os.path.join(dir1, f) - full_dir2 = os.path.join(dir2, f) - print ("Begin comparison") - print("The first checkpoint {}" .format(full_dir1)) - print("The second checkpoint {}" .format(full_dir2)) - print(' ') - model_first = torch.load(full_dir1) - model_second = torch.load(full_dir2) - object_compare(model_first, model_second, []) - - -def object_compare(model_first, model_second, key_chain): - if not (type(model_first) == type(model_second)): - log_error_value_mismatch(model_first, model_second, key_chain) - return - - if type(model_first) is list: - if len(model_first) != len(model_second): - log_error_value_mismatch(model_first, model_second, key_chain) - return - for i in 
range(len(model_first)): - object_compare(model_first[i], model_second[i], key_chain) - return - - if type(model_first) is dict or type(model_first) is OrderedDict: - common_keys = [] - for key in model_first: - if key not in model_second: - key_chain.append(key) - log_error_key_mismatch_first(model_first[key], key_chain) - key_chain.pop() - else: - common_keys.append(key) - - for key in model_second: - if key not in model_first: - key_chain.append(key) - log_error_key_mismatch_second(model_second[key], key_chain) - key_chain.pop() - - for key in common_keys: - key_chain.append(key) - object_compare(model_first[key], model_second[key], key_chain) - key_chain.pop() - return - - if hasattr(model_first, '__dict__'): - equality = (model_first.__dict__ == model_second.__dict__) - else: - equality = (model_first == model_second) - if type(equality) is not bool: - equality = (equality.all()) - if not equality: - log_error_value_mismatch(model_first, model_second, key_chain) - return - - -def log_error_file_mismatch_first(filename): - print("The following file appeared in the first but not the second directory: {}" .format(filename)) - print(' ') - - -def log_error_file_mismatch_second(filename): - print("The following key appeared in the second but not the first directory: {}" .format(filename)) - print(" ") - - -def log_error_key_mismatch_first(model, key_chain): - key_str = "/".join(key_chain) - if not (key_str in exclude_key_str): - print("The following key appeared in the first but not the second model: {}" .format(key_str)) - print("The value of the first model is: {}" .format(model)) - print(" ") - - -def log_error_key_mismatch_second(model, key_chain): - key_str = "/".join(key_chain) - if not (key_str in exclude_key_str): - print("The following key appeared in the second but not the first model: {}" .format(key_str)) - print("The value of the second model is: {}" .format(model)) - print(" ") - - -def log_error_value_mismatch(model_first, model_second, key_chain): - 
print ("The values of the following key do not match: {}" .format("/".join(key_chain))) - print ("The value of the first model is: {}" .format(model_first)) - print ("The value of the second model is: {}" .format(model_second)) - print(" ") - -if __name__ == "__main__": - main() diff --git a/deepnvme/model_checkpoint/torch_save_model.py b/deepnvme/model_checkpoint/torch_save_model.py index 6c1103049..f37d122be 100644 --- a/deepnvme/model_checkpoint/torch_save_model.py +++ b/deepnvme/model_checkpoint/torch_save_model.py @@ -6,6 +6,7 @@ from save_model_utils import get_model, validate_arguments, parse_arguments import deepspeed from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist def run(model, model_name, ckpt_name, args): @@ -59,6 +60,7 @@ def main(): if not validate_arguments(args): quit() + deepspeed.init_distributed() model, model_name, ckpt_name = get_model(args.model) if args.half: model = model.half() @@ -70,6 +72,7 @@ def main(): else: ckpt_state = {'model': model} run(ckpt_state, model_name, ckpt_name, args) + dist.destroy_process_group() if __name__ == "__main__": diff --git a/deepnvme/model_checkpoint/torch_save_tensor.py b/deepnvme/model_checkpoint/torch_save_tensor.py index 381deebde..4c73a3b2a 100644 --- a/deepnvme/model_checkpoint/torch_save_tensor.py +++ b/deepnvme/model_checkpoint/torch_save_tensor.py @@ -7,50 +7,9 @@ import deepspeed from deepspeed.accelerator import get_accelerator import deepspeed.comm as dist -import multiprocessing as mp import os -FUNC_DICT = { - # 'test_save': test_save, - # 'test_ds_mock_save': test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save, - # 'test_ds_aio_fast_save': test_ds_aio_fast_save, -} - def run(args): - - for tag, fn in FUNC_DICT.items(): - if tag == 'test_ds_gds_fast_save' and not args.gpu: - continue - print(f"launching {tag=} from {os.getpid()=}") - mp.set_start_method('spawn', force=True) - run_save_method(tag, args) - - 
-def run_save_method(tag, args): - print(f"running {tag=} from {os.getpid()=}") - device = get_accelerator().current_device_name() if args.gpu else 'cpu' - buffer = torch.randint(high=128, - size=(args.mb_size * (1024**2), ), - dtype=torch.uint8, - device=device) - - file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') - print(f'checkpoint file = {file}') - if os.path.isfile(file): - os.remove(file) - st = time.time() - write_sec = FUNC_DICT[tag](file, buffer, args) - gb_per_sec = args.mb_size / (1024.0 * write_sec) - gb_size = os.path.getsize(file) / (1024**3) - print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' - ) - print(f'*********************************************') - - -def old_run(args): device = get_accelerator().current_device_name() if args.gpu else 'cpu' buffer = torch.randint(high=128, size=(args.mb_size * (1024**2), ), @@ -61,8 +20,8 @@ def old_run(args): 'test_save': test_save, 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - # 'test_ds_aio_fast_save': test_ds_aio_fast_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save + 'test_ds_gds_fast_save': test_ds_gds_fast_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, } for tag, fn in fn_dict.items(): if tag == 'test_ds_gds_fast_save' and not args.gpu: From 9a8540b4414c04e8f1c952ac3f93f8a8ef831ce5 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 27 Mar 2025 12:15:07 -0400 Subject: [PATCH 33/40] torch changes Signed-off-by: Olatunji Ruwase --- .../torch/serialization_fast_v2.6.0.py | 1979 +++++++++++++++++ .../torch/serialization_orig_v2.6.0.py | 1975 ++++++++++++++++ 2 files changed, 3954 insertions(+) create mode 100644 deepnvme/model_checkpoint/torch/serialization_fast_v2.6.0.py create mode 100644 deepnvme/model_checkpoint/torch/serialization_orig_v2.6.0.py diff --git a/deepnvme/model_checkpoint/torch/serialization_fast_v2.6.0.py b/deepnvme/model_checkpoint/torch/serialization_fast_v2.6.0.py new file mode 100644 
index 000000000..27b90f0b8 --- /dev/null +++ b/deepnvme/model_checkpoint/torch/serialization_fast_v2.6.0.py @@ -0,0 +1,1979 @@ +# mypy: allow-untyped-defs +import copyreg +import difflib +import functools +import io +import os +import pickle +import re +import shutil +import struct +import sys +import tarfile +import tempfile +import threading +import warnings +from contextlib import closing, contextmanager +from enum import Enum +from typing import ( + Any, + BinaryIO, + Callable, + cast, + Dict, + IO, + List, + Optional, + Tuple, + Type, + Union, +) +from typing_extensions import TypeAlias, TypeIs + +import torch +import torch._weights_only_unpickler as _weights_only_unpickler +from torch._sources import get_source_lines_and_file +from torch._utils import _import_dotted_name +from torch.storage import _get_dtype_from_pickle_storage_type +from torch.types import Storage + + +__all__ = [ + "SourceChangeWarning", + "mkdtemp", + "register_package", + "check_module_version_greater_or_equal", + "validate_cuda_device", + "validate_hpu_device", + "location_tag", + "default_restore_location", + "normalize_storage_type", + "storage_to_tensor_type", + "save", + "load", + "StorageType", + "LoadEndianness", + "get_crc32_options", + "set_crc32_options", + "get_default_load_endianness", + "set_default_load_endianness", + "get_default_mmap_options", + "set_default_mmap_options", + "clear_safe_globals", + "get_safe_globals", + "add_safe_globals", + "safe_globals", + "get_unsafe_globals_in_checkpoint", + "skip_data", +] + +DEFAULT_PROTOCOL = 2 + +LONG_SIZE = struct.Struct("=l").size +INT_SIZE = struct.Struct("=i").size +SHORT_SIZE = struct.Struct("=h").size + +MAGIC_NUMBER = 0x1950A86A20F9469CFC6C +PROTOCOL_VERSION = 1001 +STORAGE_KEY_SEPARATOR = "," + +FILE_LIKE: TypeAlias = Union[str, os.PathLike, BinaryIO, IO[bytes]] +MAP_LOCATION: TypeAlias = Optional[ + Union[Callable[[Storage, str], Storage], torch.device, str, Dict[str, str]] +] +STORAGE: TypeAlias = Union[Storage, 
torch.storage.TypedStorage, torch.UntypedStorage] + +IS_WINDOWS = sys.platform == "win32" + +UNSAFE_MESSAGE = ( + "In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` " + "from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, " + "but it can result in arbitrary code execution. Do it only if you got the file from a " + "trusted source." +) + +if not IS_WINDOWS: + from mmap import MAP_PRIVATE, MAP_SHARED +else: + MAP_SHARED, MAP_PRIVATE = None, None # type: ignore[assignment] + + +def _default_to_weights_only(pickle_module): + is_fbcode = not hasattr(torch.version, "git_version") + return pickle_module is None and not is_fbcode + + +# _serialization_tls is used to store thread local state specific to serialization +# that needs to be propagated to other files, in particular we use this for +# (1) map_location (needed for wrapper subclasses/third party devices to torch._utils) +# (2) skip_data (needed for torch.Tensor.__reduce_ex__ for skip_data ctx) +# (3) materialize_fake_tensors (needed for torch.Tensor.__reduce_ex__ for skip_data ctx) +class _SerializationLocal(threading.local): + def __init__(self): + super().__init__() + self.map_location: Optional[MAP_LOCATION] = None + self.skip_data: bool = False + self.materialize_fake_tensors: bool = False + + +_serialization_tls = _SerializationLocal() + + +class SourceChangeWarning(Warning): + pass + + +@contextmanager +def mkdtemp(): + path = tempfile.mkdtemp() + try: + yield path + finally: + shutil.rmtree(path) + + +_package_registry: List[ + Tuple[ + int, + Callable[[STORAGE], Optional[str]], + Callable[[STORAGE, str], Optional[STORAGE]], + ] +] = [] + + +class LoadEndianness(Enum): + NATIVE = 1 + LITTLE = 2 + BIG = 3 + + +_default_load_endian: Optional[LoadEndianness] = None + + +def get_default_load_endianness() -> Optional[LoadEndianness]: + """ + Get fallback byte order for loading files + + If byteorder mark is not present 
in saved checkpoint, + this byte order is used as fallback. + By default, it's "native" byte order. + + Returns: + default_load_endian: Optional[LoadEndianness] + """ + return _default_load_endian + + +def set_default_load_endianness(endianness): + """ + Set fallback byte order for loading files + + If byteorder mark is not present in saved checkpoint, + this byte order is used as fallback. + By default, it's "native" byte order. + + Args: + endianness: the new fallback byte order + """ + global _default_load_endian + if not isinstance(endianness, LoadEndianness) and endianness is not None: + raise TypeError("Invalid argument type in function set_default_load_endianness") + _default_load_endian = endianness + + +_compute_crc32: bool = True + + +def get_crc32_options() -> bool: + """ + Get whether :func:`torch.save` computes and writes crc32 for each record. + + Defaults to ``True``. + """ + return _compute_crc32 + + +def set_crc32_options(compute_crc32: bool): + """ + Set whether :func:`torch.save` computes and writes crc32 for each record. + + .. note:: + Setting this to ``False`` may make unzipping of the ``torch.save`` output + fail or warn due to corrupted CRC32. However ``torch.load`` will be + able to load the file. + + Args: + compute_crc32 (bool): set crc32 compuation flag + """ + global _compute_crc32 + _compute_crc32 = compute_crc32 + + +_default_mmap_options: int = MAP_PRIVATE + + +def get_default_mmap_options() -> int: + """ + Get default mmap options for :func:`torch.load` with ``mmap=True``. + + Defaults to ``mmap.MAP_PRIVATE``. + + + Returns: + default_mmap_options: int + """ + return _default_mmap_options + + +class set_default_mmap_options: + """ + Context manager or function to set default mmap options for :func:`torch.load` with ``mmap=True`` to flags. + + For now, only either ``mmap.MAP_PRIVATE`` or ``mmap.MAP_SHARED`` are supported. + Please open an issue if you need any other option to be added here. + + .. 
note:: + This feature is currently not supported for Windows. + + Args: + flags: ``mmap.MAP_PRIVATE`` or ``mmap.MAP_SHARED`` + """ + + def __init__(self, flags: int) -> None: + if IS_WINDOWS: + raise RuntimeError( + "Changing the default mmap options is currently not supported for Windows" + ) + if flags != MAP_PRIVATE and flags != MAP_SHARED: + raise ValueError( + "Invalid argument in function set_default_mmap_options, " + f"expected mmap.MAP_PRIVATE or mmap.MAP_SHARED, but got {flags}" + ) + global _default_mmap_options + self.prev = _default_mmap_options + _default_mmap_options = flags + + def __enter__(self) -> None: + pass + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + global _default_mmap_options + _default_mmap_options = self.prev + + +def clear_safe_globals() -> None: + """ + Clears the list of globals that are safe for ``weights_only`` load. + """ + _weights_only_unpickler._clear_safe_globals() + + +def get_safe_globals() -> List[Union[Callable, Tuple[Callable, str]]]: + """ + Returns the list of user-added globals that are safe for ``weights_only`` load. + """ + return _weights_only_unpickler._get_safe_globals() + + +def add_safe_globals(safe_globals: List[Union[Callable, Tuple[Callable, str]]]) -> None: + """ + Marks the given globals as safe for ``weights_only`` load. For example, functions + added to this list can be called during unpickling, classes could be instantiated + and have state set. + + Each item in the list can either be a function/class or a tuple of the form + (function/class, string) where string is the full path of the function/class. + + Within the serialized format, each function is identified with its full + path as ``{__module__}.{__name__}``. When calling this API, you can provide this + full path that should match the one in the checkpoint otherwise the default + ``{fn.__module__}.{fn.__name__}`` will be used. 
+ + Args: + safe_globals (List[Union[Callable, Tuple[Callable, str]]]): list of globals to mark as safe + + Example: + >>> # xdoctest: +SKIP("Can't torch.save(t, ...) as doctest thinks MyTensor is defined on torch.serialization") + >>> import tempfile + >>> class MyTensor(torch.Tensor): + ... pass + >>> t = MyTensor(torch.randn(2, 3)) + >>> with tempfile.NamedTemporaryFile() as f: + ... torch.save(t, f.name) + # Running `torch.load(f.name, weights_only=True)` will fail with + # Unsupported global: GLOBAL __main__.MyTensor was not an allowed global by default. + # Check the code and make sure MyTensor is safe to be used when loaded from an arbitrary checkpoint. + ... torch.serialization.add_safe_globals([MyTensor]) + ... torch.load(f.name, weights_only=True) + # MyTensor([[-0.5024, -1.8152, -0.5455], + # [-0.8234, 2.0500, -0.3657]]) + """ + _weights_only_unpickler._add_safe_globals(safe_globals) + + +class safe_globals(_weights_only_unpickler._safe_globals): + r"""Context-manager that adds certain globals as safe for ``weights_only`` load. + + Args: + safe_globals: List of globals for weights_only load. + + Example: + >>> # xdoctest: +SKIP("Can't torch.save(t, ...) as doctest thinks MyTensor is defined on torch.serialization") + >>> import tempfile + >>> class MyTensor(torch.Tensor): + ... pass + >>> t = MyTensor(torch.randn(2, 3)) + >>> with tempfile.NamedTemporaryFile() as f: + ... torch.save(t, f.name) + # Running `torch.load(f.name, weights_only=True)` will fail with + # Unsupported global: GLOBAL __main__.MyTensor was not an allowed global by default. + # Check the code and make sure MyTensor is safe to be used when loaded from an arbitrary checkpoint. + ... with torch.serialization.safe_globals([MyTensor]): + ... 
torch.load(f.name, weights_only=True) + # MyTensor([[-0.5024, -1.8152, -0.5455], + # [-0.8234, 2.0500, -0.3657]]) + >>> assert torch.serialization.get_safe_globals() == [] + """ + + +def get_unsafe_globals_in_checkpoint(f: FILE_LIKE) -> List[str]: + """Returns a list of strings of functions/classes in a ``torch.save`` object that are not safe for ``weights_only``. + + For a given function or class ``f``, the corresponding string will be of the form + ``{f.__module__}.{f.__name__}``. + + This function will return any GLOBALs in the checkpoint that are not in the set marked safe + for ``weights_only`` (either via :func:`add_safe_globals` or :class:`safe_globals` context or + allowlisted by ``torch`` by default). + + .. note:: + This function will statically disassemble the pickle file in the checkpoint. + The implication is any classes dynamically pushed onto the stack during unpickling + will not be included in the output. + + Args: + f: File-like object or string containing the checkpoint object saved via ``torch.save`` + + Returns: + A list of strings of pickle GLOBALs in the checkpoint that are not allowlisted for ``weights_only``. 
+ """ + default_safe_globals_strings = set( + _weights_only_unpickler._get_allowed_globals().keys() + ) + user_safe_global_strings = set( + _weights_only_unpickler._get_user_allowed_globals().keys() + ) + safe_global_strings = default_safe_globals_strings.union(user_safe_global_strings) + + with _open_file_like(f, "rb") as opened_file: + if not _is_zipfile(opened_file): + raise ValueError("Expected input to be a checkpoint returned by torch.save") + with _open_zipfile_reader(opened_file) as zip_file: + if _is_torchscript_zip(zip_file): + raise ValueError( + "Expected input to be a checkpoint returned by torch.save but got a torchscript checkpoint" + ) + data_file = io.BytesIO(zip_file.get_record("data.pkl")) + all_globals = _weights_only_unpickler.get_globals_in_pkl(data_file) + return list(all_globals.difference(safe_global_strings)) + + +class skip_data: + """ + Context-manager that skips writing storage bytes for ``torch.save`` calls. + + Storages will still be saved, but the space that their bytes would usually be written to + will be empty space. The storage bytes can then be populated in a separate pass. + + .. warning:: + The ``skip_data`` context manager is an early prototype and is subject to change. + + Args: + materialize_fake_tensors: Whether to materialize FakeTensors. + + Example: + >>> # xdoctest: +SKIP("NamedTemporaryFile on Windows") + >>> import tempfile + >>> t = torch.randn(2, 3) + >>> with tempfile.NamedTemporaryFile() as f: + ... with torch.serialization.skip_data(): + ... torch.save(t, f.name) + ... 
torch.load(f.name, weights_only=True) + tensor([[0., 0., 0.], + [0., 0., 0.]]) + """ + + def __init__(self, materialize_fake_tensors: bool = False): + self.materialize_fake_tensors = materialize_fake_tensors + + def __enter__(self): + global _serialization_tls + self._old_skip_data = _serialization_tls.skip_data + self._old_materialize_fake_tensors = _serialization_tls.materialize_fake_tensors + _serialization_tls.skip_data = True + _serialization_tls.materialize_fake_tensors = self.materialize_fake_tensors + + def __exit__(self, type, value, tb): + global _serialization_tls + _serialization_tls.skip_data = self._old_skip_data + _serialization_tls.materialize_fake_tensors = self._old_materialize_fake_tensors + + +def _is_zipfile(f) -> bool: + # This is a stricter implementation than zipfile.is_zipfile(). + # zipfile.is_zipfile() is True if the magic number appears anywhere in the + # binary. Since we expect the files here to be generated by torch.save or + # torch.jit.save, it's safe to only check the start bytes and avoid + # collisions and assume the zip has only 1 file. + # See bugs.python.org/issue28494. + + start = f.tell() + # Read the first few bytes and match against the ZIP file signature + local_header_magic_number = b"PK\x03\x04" + read_bytes = f.read(len(local_header_magic_number)) + f.seek(start) + return read_bytes == local_header_magic_number + + +def register_package( + priority: int, + tagger: Callable[[STORAGE], Optional[str]], + deserializer: Callable[[STORAGE, str], Optional[STORAGE]], +): + """ + Registers callables for tagging and deserializing storage objects with an associated priority. + Tagging associates a device with a storage object at save time while deserializing moves a + storage object to an appropriate device at load time. :attr:`tagger` and :attr:`deserializer` + are run in the order given by their :attr:`priority` until a tagger/deserializer returns a + value that is not `None`. 

    To override the deserialization behavior for a device in the global registry, one can register a
    tagger with a higher priority than the existing tagger.

    This function can also be used to register a tagger and deserializer for new devices.

    Args:
        priority: Indicates the priority associated with the tagger and deserializer, where a lower
            value indicates higher priority.
        tagger: Callable that takes in a storage object and returns its tagged device as a string
            or None.
        deserializer: Callable that takes in storage object and a device string and returns a storage
            object on the appropriate device or None.

    Returns:
        `None`

    Example:
        >>> def ipu_tag(obj):
        >>>     if obj.device.type == 'ipu':
        >>>         return 'ipu'
        >>> def ipu_deserialize(obj, location):
        >>>     if location.startswith('ipu'):
        >>>         ipu = getattr(torch, "ipu", None)
        >>>         assert ipu is not None, "IPU device module is not loaded"
        >>>         assert torch.ipu.is_available(), "ipu is not available"
        >>>         return obj.ipu(location)
        >>> torch.serialization.register_package(11, ipu_tag, ipu_deserialize)
    """
    # Registry entries are (priority, tagger, deserializer) tuples. Keeping the
    # list sorted means location_tag() / default_restore_location() probe the
    # lowest priority value (highest precedence) first.
    queue_elem = (priority, tagger, deserializer)
    _package_registry.append(queue_elem)
    _package_registry.sort()


def check_module_version_greater_or_equal(
    module,
    req_version_tuple,
    error_if_malformed=True,
):
    """
    Check if a module's version satisfies requirements

    Usually, a module's version string will be like 'x.y.z', which would be represented
    as a tuple (x, y, z), but sometimes it could be an unexpected format. If the version
    string does not match the given tuple's format up to the length of the tuple, then
    error and exit or emit a warning.

    Args:
        module: the module to check the version of
        req_version_tuple: tuple (usually of ints) representing the required version
        error_if_malformed: whether we should exit if module version string is malformed

    Returns:
        requirement_is_met: bool
    """
    try:
        version_strs = module.__version__.split(".")
        # Cast module version fields to match the types of the required version
        module_version = tuple(
            type(req_field)(version_strs[idx])
            for idx, req_field in enumerate(req_version_tuple)
        )
        requirement_is_met = module_version >= req_version_tuple

    except Exception as e:
        # Any failure above (missing __version__, non-numeric field, fewer
        # fields than required) is treated as a malformed version string.
        message = (
            f"'{module.__name__}' module version string is malformed '{module.__version__}' and cannot be compared"
            f" with tuple {str(req_version_tuple)}"
        )
        if error_if_malformed:
            raise RuntimeError(message) from e
        else:
            # Optimistic fallback: assume the requirement is met when we
            # cannot parse the version.
            warnings.warn(message + ", but continuing assuming that requirement is met")
            requirement_is_met = True

    return requirement_is_met


# --- Device taggers --------------------------------------------------------
# Each tagger maps a storage's device to the location string recorded in a
# checkpoint, returning None when it does not apply (so the registry falls
# through to the next entry).
def _cpu_tag(obj):
    if obj.device.type == "cpu":
        return "cpu"


def _mps_tag(obj):
    if obj.device.type == "mps":
        return "mps"


def _meta_tag(obj):
    if obj.device.type == "meta":
        return "meta"


def _backend_tag(backend_name, obj):
    # Generic tagger for index-addressable backends (cuda/hpu/xpu/privateuse1).
    # 'privateuse1' is resolved to whatever backend name was registered for it.
    if backend_name == "privateuse1":
        backend_name = torch._C._get_privateuse1_backend_name()
    if obj.device.type == backend_name:
        if obj.device.index is None:
            return backend_name
        else:
            return backend_name + ":" + str(obj.device.index)


# --- Deserializers ---------------------------------------------------------
# Each deserializer restores a storage for its location tag, returning None
# when the tag is not for it.
def _cpu_deserialize(obj, location):
    if location == "cpu":
        return obj


def _mps_deserialize(obj, location):
    if location.startswith("mps"):
        return obj.mps()


def _meta_deserialize(obj, location):
    if location == "meta":
        # Meta storages carry no data; recreate an empty storage of equal size.
        return torch.UntypedStorage(obj.nbytes(), device="meta")


def _validate_device(location, backend_name):
    """
    Check whether the device index of specified backend is valid

    In case of privateuse1 backend, your must first register a device_module for
    privateuse1 using torch._register_device_module. Implement the following
    methods in device_module like cuda: device_module._utils._get_device_index(location, True),
    device_module.device_count().

    Args:
        location: string of device
        backend_name: the backend name or the name of privateuse1, which can be renamed

    Returns:
        device_index: int
    """
    if not hasattr(torch, backend_name):
        raise RuntimeError(
            f"The {backend_name.upper()} device module is not registered. "
            "If you are running on a CPU-only machine, "
            "please use torch.load with map_location=torch.device('cpu') "
            "to map your storages to the CPU."
        )
    device_module = getattr(torch, backend_name)
    if hasattr(device_module, "_utils") and hasattr(
        device_module._utils, "_get_device_index"
    ):
        device_index = device_module._utils._get_device_index(location, True)
        device = torch.device(backend_name, device_index)
    else:
        device = torch.device(location)
        # Note: an explicit index of 0 and a missing index both map to 0 here.
        device_index = device.index if device.index else 0
    if hasattr(device_module, "is_available") and not device_module.is_available():
        raise RuntimeError(
            f"Attempting to deserialize object on a {backend_name.upper()} "
            f"device but torch.{backend_name}.is_available() is False. "
            "If you are running on a CPU-only machine, "
            "please use torch.load with map_location=torch.device('cpu') "
            "to map your storages to the CPU."
        )
    if hasattr(device_module, "device_count"):
        device_count = device_module.device_count()
        if device_index >= device_count:
            raise RuntimeError(
                f"Attempting to deserialize object on {backend_name.upper()} device "
                f"{device_index} but torch.{backend_name}.device_count() is {device_count}. "
                "Please use torch.load with map_location to map your storages "
                "to an existing device."
            )
    return device


def validate_cuda_device(location):
    # Backward-compatible wrapper returning just the validated CUDA index.
    return _validate_device(location, "cuda").index


def validate_hpu_device(location):
    # Backward-compatible wrapper returning just the validated HPU index.
    return _validate_device(location, "hpu").index


def _deserialize(backend_name, obj, location):
    # Generic deserializer: validate the target device, then move the storage.
    if backend_name == "privateuse1":
        backend_name = torch._C._get_privateuse1_backend_name()
    if location.startswith(backend_name):
        device = _validate_device(location, backend_name)
        return obj.to(device=device)


# Default registrations. The priority values fix the probe order used by
# location_tag() and default_restore_location(): cpu first, then cuda, mps,
# meta, privateuse1, hpu, xpu.
register_package(10, _cpu_tag, _cpu_deserialize)
register_package(
    20,
    functools.partial(_backend_tag, "cuda"),
    functools.partial(_deserialize, "cuda"),
)
register_package(21, _mps_tag, _mps_deserialize)
register_package(22, _meta_tag, _meta_deserialize)
register_package(
    23,
    functools.partial(_backend_tag, "privateuse1"),
    functools.partial(_deserialize, "privateuse1"),
)
register_package(
    24,
    functools.partial(_backend_tag, "hpu"),
    functools.partial(_deserialize, "hpu"),
)
register_package(
    25,
    functools.partial(_backend_tag, "xpu"),
    functools.partial(_deserialize, "xpu"),
)


def location_tag(
    storage: Union[Storage, torch.storage.TypedStorage, torch.UntypedStorage],
):
    # Ask each registered tagger in priority order; the first non-falsy tag wins.
    for _, tagger, _ in _package_registry:
        location = tagger(storage)
        if location:
            return location
    raise RuntimeError(
        "don't know how to determine data location of " + torch.typename(storage)
    )


def default_restore_location(storage, location):
    """
    Restores `storage` using a deserializer function registered for the `location`.

    This function looks in the registry for deserializer functions that match the `location`.
    If found, it attempts to use them, in priority order, to restore `storage` until one
    returns a not `None` result. If no deserializer can be found in the registry, or all found fail
    to bear a result, it raises a `RuntimeError`.

    Args:
        storage (STORAGE): the storage object to restore
        location (str): the location tag associated with the storage object

    Returns:
        storage: Optional[STORAGE]

    Raises:
        RuntimeError: If no deserializer matching `location` is found in the registry or if
            all matching ones return `None`.
    """
    for _, _, fn in _package_registry:
        result = fn(storage, location)
        if result is not None:
            return result
    raise RuntimeError(
        "don't know how to restore data location of "
        + torch.typename(storage)
        + " (tagged with "
        + location
        + ")"
    )


def normalize_storage_type(storage_type):
    # Map a storage class to its canonical torch.* counterpart by name.
    return getattr(torch, storage_type.__name__)


def storage_to_tensor_type(storage):
    # E.g. xxx.FloatStorage -> xxx.FloatTensor, resolved in the storage's module.
    storage_type = type(storage)
    module = _import_dotted_name(storage_type.__module__)
    return getattr(module, storage_type.__name__.replace("Storage", "Tensor"))


def _is_path(name_or_buffer) -> TypeIs[Union[str, os.PathLike]]:
    return isinstance(name_or_buffer, (str, os.PathLike))


class _opener:
    # Minimal context-manager wrapper that yields the underlying file-like
    # object; subclasses override __exit__ to close/flush as appropriate.
    def __init__(self, file_like):
        self.file_like = file_like

    def __enter__(self):
        return self.file_like

    def __exit__(self, *args):
        pass


class _open_file(_opener):
    # Opens a real file by path and closes it on exit.
    def __init__(self, name, mode):
        super().__init__(open(name, mode))

    def __exit__(self, *args):
        self.file_like.close()


class _open_buffer_reader(_opener):
    # Wraps a caller-provided readable buffer; verifies it is seekable up front.
    def __init__(self, buffer):
        super().__init__(buffer)
        _check_seekable(buffer)


class _open_buffer_writer(_opener):
    # Wraps a caller-provided writable buffer; flushes (but does not close) it.
    def __exit__(self, *args):
        self.file_like.flush()


def _open_file_like(name_or_buffer, mode):
    # Dispatch: paths get a real file; buffers get a thin reader/writer wrapper.
    if _is_path(name_or_buffer):
        return _open_file(name_or_buffer, mode)
    else:
        if "w" in mode:
            return _open_buffer_writer(name_or_buffer)
        elif "r" in mode:
            return _open_buffer_reader(name_or_buffer)
        else:
            raise RuntimeError(f"Expected 'r' or 'w' in mode but got {mode}")


class _open_zipfile_reader(_opener):
    def __init__(self, name_or_buffer) -> None:
    # We need it so that Sphinx doesn't leak `pickle`s path from
    # NOTE(review): the docstring opening of save() appears truncated by patch
    # extraction at this point — the Args/description section is missing from
    # this hunk view.
    # the build environment (e.g. `>> # xdoctest: +SKIP("makes cwd dirty")
        >>> # Save to file
        >>> x = torch.tensor([0, 1, 2, 3, 4])
        >>> torch.save(x, "tensor.pt")
        >>> # Save to io.BytesIO buffer
        >>> buffer = io.BytesIO()
        >>> torch.save(x, buffer)
    """
    torch._C._log_api_usage_once("torch.save")
    _check_dill_version(pickle_module)
    _check_save_filelike(f)

    if _use_new_zipfile_serialization:
        with _open_zipfile_writer(f) as opened_zipfile:
            _save(
                obj,
                opened_zipfile,
                pickle_module,
                pickle_protocol,
                _disable_byteorder_record,
            )
            return
    else:
        global _serialization_tls
        if _serialization_tls.skip_data:
            raise RuntimeError(
                "Cannot use skip_data=True with _use_new_zipfile_serialization=False"
            )
        with _open_file_like(f, "wb") as opened_file:
            _legacy_save(obj, opened_file, pickle_module, pickle_protocol)


def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None:
    # Writes the pre-zipfile (sequential pickle) checkpoint format:
    # magic number, protocol, sys_info, pickled object graph, sorted storage
    # keys, then the raw storage bytes.
    import torch.nn as nn

    serialized_container_types = {}
    serialized_storages: Dict[str, Tuple[torch.UntypedStorage, torch.dtype]] = {}

    # Since loading storages that view the same data with different dtypes is
    # not supported, we need to keep track of the dtype associated with each
    # storage data_ptr and throw an error if the dtype is ever different.
    # TODO: This feature could be added in the future
    storage_dtypes: Dict[int, torch.dtype] = {}

    def persistent_id(obj: Any) -> Optional[Tuple]:
        # FIXME: the docs say that persistent_id should only return a string
        # but torch store returns tuples. This works only in the binary protocol
        # see
        # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
        # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            # nn.Module *classes* get their source code recorded (once per
            # class) so it can be diffed against the current source on load.
            if obj in serialized_container_types:
                return None
            serialized_container_types[obj] = True
            source_file = source = None
            try:
                source_lines, _, source_file = get_source_lines_and_file(obj)
                source = "".join(source_lines)
            except (
                Exception
            ):  # saving the source is optional, so we can ignore any errors
                warnings.warn(
                    "Couldn't retrieve source code for container of "
                    "type " + obj.__name__ + ". It won't be checked "
                    "for correctness upon loading."
                )
            return ("module", obj, source_file, source)

        if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj):
            storage: torch.UntypedStorage

            if isinstance(obj, torch.storage.TypedStorage):
                # TODO: Once we decide to break serialization FC, this case
                # can be deleted
                storage = obj._untyped_storage
                storage_dtype = obj.dtype
                storage_type_str = obj._pickle_storage_type()
                storage_type = getattr(torch, storage_type_str)
                dtype = obj.dtype
                storage_numel = obj._size()

            elif isinstance(obj, torch.UntypedStorage):
                storage = obj
                storage_dtype = torch.uint8
                storage_type = normalize_storage_type(type(obj))
                dtype = torch.uint8
                storage_numel = storage.nbytes()
            else:
                raise TypeError(f"type not recognized: {type(obj)}")

            # If storage is allocated, ensure that any other saved storages
            # pointing to the same data all have the same dtype. If storage is
            # not allocated, don't perform this check
            if storage.data_ptr() != 0:
                if storage.data_ptr() in storage_dtypes:
                    if storage_dtype != storage_dtypes[storage.data_ptr()]:
                        raise RuntimeError(
                            "Cannot save multiple tensors or storages that "
                            "view the same data as different types"
                        )
                else:
                    storage_dtypes[storage.data_ptr()] = storage_dtype

            view_metadata: Optional[Tuple[str, int, int]]

            # Offset is always 0, but we keep it for backwards compatibility
            # with the old serialization format (which supported storage views)
            offset = 0
            storage_key = str(storage._cdata)
            location = location_tag(storage)

            # TODO: There's an issue here with FC. It might be impossible to
            # solve, but it's worth noting. Imagine we save a list `[storage,
            # tensor]`, where `tensor.storage()` is the same as `storage`, and
            # `tensor.element_size() > 1`. Let's say that `tensor.dtype ==
            # torch.float`. The storage will be serialized with element size
            # of 1, since we're choosing to serialize the first occurance of
            # a duplicate storage. Since this legacy serialization format saves
            # the numel of the storage, rather than nbytes directly, we'll be
            # effectively saving nbytes in this case. We'll be able to load it
            # and the tensor back up with no problems in _this_ and future
            # versions of pytorch, but in older versions, here's the problem:
            # the storage will be loaded up as a UntypedStorage, and then the
            # FloatTensor will loaded and the UntypedStorage will be assigned to
            # it. Since the storage dtype does not match the tensor dtype, this
            # will cause an error. If we reverse the list, like `[tensor,
            # storage]`, then we will save the `tensor.storage()` as a faked
            # `FloatStorage`, and the saved size will be the correct
            # dtype-specific numel count that old versions expect. `tensor`
            # will be able to load up properly in old versions, pointing to
            # a FloatStorage. However, `storage` is still being translated to
            # a UntypedStorage, and it will try to resolve to the same
            # FloatStorage that `tensor` contains. This will also cause an
            # error. It doesn't seem like there's any way around this.
            # Probably, we just cannot maintain FC for the legacy format if the
            # saved list contains both a tensor and a storage that point to the
            # same data. We should still be able to maintain FC for lists of
            # just tensors, as long as all views share the same dtype as the
            # tensor they are viewing.

            if storage_key not in serialized_storages:
                serialized_storages[storage_key] = (storage, dtype)
            # NOTE(review): `storage._cdata != storage._cdata` is always False,
            # so `view_metadata` is always None — this looks like a vestige of
            # the removed storage-view support, kept so the serialized record
            # shape stays backward compatible. Confirm before simplifying.
            is_view = storage._cdata != storage._cdata
            if is_view:
                view_metadata = (str(storage._cdata), offset, storage.nbytes())
            else:
                view_metadata = None

            res = (
                "storage",
                storage_type,
                storage_key,
                location,
                storage_numel,
                view_metadata,
            )
            return res
        return None

    sys_info = dict(
        protocol_version=PROTOCOL_VERSION,
        little_endian=sys.byteorder == "little",
        type_sizes=dict(
            short=SHORT_SIZE,
            int=INT_SIZE,
            long=LONG_SIZE,
        ),
    )

    pickle_module.dump(MAGIC_NUMBER, f, protocol=pickle_protocol)
    pickle_module.dump(PROTOCOL_VERSION, f, protocol=pickle_protocol)
    pickle_module.dump(sys_info, f, protocol=pickle_protocol)

    class PyTorchLegacyPickler(pickle_module.Pickler):
        def persistent_id(self, obj):
            return persistent_id(obj)

    pickler = PyTorchLegacyPickler(f, protocol=pickle_protocol)
    pickler.dump(obj)

    serialized_storage_keys = sorted(serialized_storages.keys())
    pickle_module.dump(serialized_storage_keys, f, protocol=pickle_protocol)
    f.flush()
    # DeepSpeed fast-checkpoint hook: file writers that expose
    # save_torch_storage_object_list (e.g. the deepspeed.io writers from this
    # patch) receive the whole sorted (storage, dtype) list in one call,
    # instead of a per-storage _write_file round trip.
    if hasattr(f, 'save_torch_storage_object_list'):
        sorted_storage_objects = [serialized_storages[key] for key in serialized_storage_keys]
        f.save_torch_storage_object_list(sorted_storage_objects, True)
    else:
        for key in serialized_storage_keys:
            storage, dtype = serialized_storages[key]
            storage._write_file(
                f,
                _should_read_directly(f), True, torch._utils._element_size(dtype)
            )


def _save(
    obj,
    zip_file,
    pickle_module,
    pickle_protocol,
    _disable_byteorder_record,
):
    # Writes the zipfile checkpoint format: "data.pkl" (pickled object graph),
    # an optional "byteorder" record, then one "data/<key>" record per
    # deduplicated storage.
    serialized_storages = {}
    id_map: Dict[int, str] = {}

    # Since loading storages that view the same data with different dtypes is
    # not supported, we need to keep track of the dtype associated with each
    # storage data_ptr and throw an error if the dtype is ever different.
    # TODO: This feature could be added in the future
    storage_dtypes: Dict[int, torch.dtype] = {}

    def persistent_id(obj):
        # FIXME: the docs say that persistent_id should only return a string
        # but torch store returns tuples. This works only in the binary protocol
        # see
        # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
        # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
        if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj):
            if isinstance(obj, torch.storage.TypedStorage):
                # TODO: Once we decide to break serialization FC, this case
                # can be deleted
                storage = obj._untyped_storage
                storage_dtype = obj.dtype
                storage_type_str = obj._pickle_storage_type()
                storage_type = getattr(torch, storage_type_str)
                storage_numel = obj._size()

            else:
                storage = obj
                storage_dtype = torch.uint8
                storage_type = normalize_storage_type(type(obj))
                storage_numel = storage.nbytes()

            # If storage is allocated, ensure that any other saved storages
            # pointing to the same data all have the same dtype. If storage is
            # not allocated, don't perform this check
            if str(storage.device) != "meta" and storage.data_ptr() != 0:
                if storage.data_ptr() in storage_dtypes:
                    if storage_dtype != storage_dtypes[storage.data_ptr()]:
                        raise RuntimeError(
                            "Cannot save multiple tensors or storages that "
                            "view the same data as different types"
                        )
                else:
                    storage_dtypes[storage.data_ptr()] = storage_dtype

            # Deduplicate storages by C-level identity; keys are dense string
            # indices assigned in first-seen order.
            storage_key = id_map.setdefault(storage._cdata, str(len(id_map)))
            # _fake_device, when present, overrides the real location tag
            # (used by fake-tensor serialization paths).
            if hasattr(obj, "_fake_device") and obj._fake_device is not None:
                location = str(obj._fake_device)
            else:
                location = location_tag(storage)
            serialized_storages[storage_key] = storage

            return ("storage", storage_type, storage_key, location, storage_numel)

        return None

    # Write the pickle data for `obj`
    data_buf = io.BytesIO()

    class PyTorchPickler(pickle_module.Pickler):  # type: ignore[name-defined]
        def persistent_id(self, obj):
            return persistent_id(obj)

    pickler = PyTorchPickler(data_buf, protocol=pickle_protocol)
    pickler.dump(obj)
    data_value = data_buf.getvalue()
    zip_file.write_record("data.pkl", data_value, len(data_value))

    # Write byte order marker
    if not _disable_byteorder_record:
        if sys.byteorder not in ["little", "big"]:
            raise ValueError("Unknown endianness type: " + sys.byteorder)

        zip_file.write_record("byteorder", sys.byteorder, len(sys.byteorder))

    # Write each tensor to a file named tensor/the_tensor_key in the zip archive
    for key in sorted(serialized_storages.keys()):
        name = f"data/{key}"
        storage = serialized_storages[key]
        num_bytes = storage.nbytes()
        global _serialization_tls
        if _serialization_tls.skip_data:
            # skip_data mode reserves space for the record without copying bytes.
            zip_file.write_record_metadata(name, num_bytes)
        else:
            # given that we copy things around anyway, we might use storage.cpu()
            # this means to that to get tensors serialized, you need to implement
            # .cpu() on the underlying Storage
            if storage.device.type != "cpu":
                storage = storage.cpu()
            # Now that it is on
            # the CPU we can directly copy it into the zip file
            zip_file.write_record(name, storage, num_bytes)


def load(
    f: FILE_LIKE,
    map_location: MAP_LOCATION = None,
    pickle_module: Any = None,
    *,
    weights_only: Optional[bool] = None,
    mmap: Optional[bool] = None,
    **pickle_load_args: Any,
) -> Any:
    # Reference: https://github.com/pytorch/pytorch/issues/54354
    # The first line of this docstring overrides the one Sphinx generates for the
    # documentation. We need it so that Sphinx doesn't leak `pickle`s path from
    # NOTE(review): the docstring opening of load() appears truncated by patch
    # extraction here — only the Example section survives in this hunk view.
    # the build environment (e.g. `>> # xdoctest: +SKIP("undefined filepaths")
        >>> torch.load("tensors.pt", weights_only=True)
        # Load all tensors onto the CPU
        >>> torch.load("tensors.pt", map_location=torch.device("cpu"), weights_only=True)
        # Load all tensors onto the CPU, using a function
        >>> torch.load(
        ...     "tensors.pt", map_location=lambda storage, loc: storage, weights_only=True
        ... )
        # Load all tensors onto GPU 1
        >>> torch.load(
        ...     "tensors.pt",
        ...     map_location=lambda storage, loc: storage.cuda(1),
        ...     weights_only=True,
        ... )  # type: ignore[attr-defined]
        # Map tensors from GPU 1 to GPU 0
        >>> torch.load("tensors.pt", map_location={"cuda:1": "cuda:0"}, weights_only=True)
        # Load tensor from io.BytesIO object
        # Loading from a buffer setting weights_only=False, warning this can be unsafe
        >>> with open("tensor.pt", "rb") as f:
        ...     buffer = io.BytesIO(f.read())
        >>> torch.load(buffer, weights_only=False)
        # Load a module with 'ascii' encoding for unpickling
        # Loading from a module setting weights_only=False, warning this can be unsafe
        >>> torch.load("module.pt", encoding="ascii", weights_only=False)
    """
    torch._C._log_api_usage_once("torch.load")
    DOCS_MESSAGE = (
        "\n\nCheck the documentation of torch.load to learn more about types accepted by default with "
        "weights_only https://pytorch.org/docs/stable/generated/torch.load.html."
    )

    def _get_wo_message(message: str) -> str:
        # Rewrites a WeightsUnpickler error into actionable guidance, depending
        # on which failure pattern the message matches.
        unsafe_global_pattern = r"GLOBAL (\S+) was not an allowed global by default."
        has_unsafe_global = re.search(unsafe_global_pattern, message) is not None
        blocklist_pattern = r"whose module (\S+) is blocked"
        has_blocklist = re.search(blocklist_pattern, message) is not None
        import_pattern = r"(\S+) must be (\S+) to load"
        has_import = re.search(import_pattern, message) is not None
        if has_unsafe_global:
            updated_message = (
                "Weights only load failed. This file can still be loaded, to do so you have two options, "
                "\033[1mdo those steps only if you trust the source of the checkpoint\033[0m. "
                f"\n\t(1) {UNSAFE_MESSAGE}\n\t(2) Alternatively, to load with `weights_only=True` please check "
                "the recommended steps in the following error message.\n\tWeightsUnpickler error: "
                + message
            )
        else:
            if has_import:
                return f"Weights only load failed. {message}\n {UNSAFE_MESSAGE}\n"
            else:
                updated_message = f"Weights only load failed. {UNSAFE_MESSAGE}\n"
                if not has_blocklist:
                    updated_message += (
                        "Please file an issue with the following so that we can make "
                        "`weights_only=True` compatible with your use case: WeightsUnpickler error: "
                    )
            updated_message += message
        return updated_message + DOCS_MESSAGE

    global _serialization_tls
    skip_data = _serialization_tls.skip_data
    if skip_data:
        raise RuntimeError(
            "`torch.load` called within a torch.serialization.skip_data context manager "
            "is not supported yet. Please call torch.load outside the skip_data context manager."
        )

    weights_only_not_set = weights_only is None

    if weights_only_not_set:
        weights_only = _default_to_weights_only(pickle_module)

    true_values = ["1", "y", "yes", "true"]
    # Add ability to force safe only or non-safe weight loads via environment variables
    force_weights_only_load = (
        os.getenv("TORCH_FORCE_WEIGHTS_ONLY_LOAD", "0") in true_values
    )
    force_no_weights_only_load = (
        os.getenv("TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD", "0") in true_values
    )

    if force_weights_only_load and force_no_weights_only_load:
        raise RuntimeError(
            "Only one of `TORCH_FORCE_WEIGHTS_ONLY_LOAD` or `TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD` "
            "should be set, but both were set."
        )
    elif force_weights_only_load:
        weights_only = True
    elif force_no_weights_only_load:
        # TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD can only override if callsite did not explicitly set weights_only
        if weights_only_not_set:
            warnings.warn(
                "Environment variable TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD detected, since the"
                "`weights_only` argument was not explicitly passed to `torch.load`, forcing weights_only=False.",
                UserWarning,
                stacklevel=2,
            )
            weights_only = False

    if weights_only:
        if pickle_module is not None:
            raise RuntimeError(
                "Can not safely load weights when explicit pickle_module is specified"
            )
    else:
        if pickle_module is None:
            pickle_module = pickle

    # make flipping default BC-compatible
    if mmap is None:
        mmap = False

    _check_dill_version(pickle_module)

    if "encoding" not in pickle_load_args.keys():
        pickle_load_args["encoding"] = "utf-8"

    with _open_file_like(f, "rb") as opened_file:
        if _is_zipfile(opened_file):
            # The zipfile reader is going to advance the current file position.
            # If we want to actually tail call to torch.jit.load, we need to
            # reset back to the original position.
            orig_position = opened_file.tell()
            overall_storage = None
            with _open_zipfile_reader(opened_file) as opened_zipfile:
                if _is_torchscript_zip(opened_zipfile):
                    warnings.warn(
                        "'torch.load' received a zip file that looks like a TorchScript archive"
                        " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to"
                        " silence this warning)",
                        UserWarning,
                    )
                    if weights_only:
                        raise RuntimeError(
                            "Cannot use ``weights_only=True`` with TorchScript archives passed to "
                            "``torch.load``. " + UNSAFE_MESSAGE
                        )
                    opened_file.seek(orig_position)
                    return torch.jit.load(opened_file, map_location=map_location)
                if mmap:
                    if not _is_path(f):
                        raise ValueError(
                            "f must be a file path in order to use the mmap argument"
                        )
                    size = os.path.getsize(f)
                    if not IS_WINDOWS:
                        shared = get_default_mmap_options() == MAP_SHARED
                    else:
                        shared = False
                    # The whole file is mapped once; _load slices storages out
                    # of this overall storage by record offset.
                    overall_storage = torch.UntypedStorage.from_file(
                        os.fspath(f), shared, size
                    )
                if weights_only:
                    try:
                        return _load(
                            opened_zipfile,
                            map_location,
                            _weights_only_unpickler,
                            overall_storage=overall_storage,
                            **pickle_load_args,
                        )
                    except pickle.UnpicklingError as e:
                        raise pickle.UnpicklingError(_get_wo_message(str(e))) from None
                return _load(
                    opened_zipfile,
                    map_location,
                    pickle_module,
                    overall_storage=overall_storage,
                    **pickle_load_args,
                )
        if mmap:
            # mmap requires the zipfile format; legacy files cannot be mapped.
            f_name = "" if not isinstance(f, str) else f"{f}, "
            raise RuntimeError(
                "mmap can only be used with files saved with "
                f"`torch.save({f_name}_use_new_zipfile_serialization=True), "
                "please torch.save your checkpoint with this option in order to use mmap."
            )
        if weights_only:
            try:
                return _legacy_load(
                    opened_file,
                    map_location,
                    _weights_only_unpickler,
                    **pickle_load_args,
                )
            except pickle.UnpicklingError as e:
                raise pickle.UnpicklingError(_get_wo_message(str(e))) from None
        return _legacy_load(
            opened_file, map_location, pickle_module, **pickle_load_args
        )


# Register pickling support for layout instances such as
# torch.sparse_coo, etc
def _get_layout(name):
    """Get layout extension object from its string representation."""
    cache = _get_layout.cache  # type: ignore[attr-defined]
    if not cache:
        # Lazily populate the cache from all torch.layout singletons.
        for v in torch.__dict__.values():
            if isinstance(v, torch.layout):
                cache[str(v)] = v
    return cache[name]


# There are yet not good way to type annotate function attributes https://github.com/python/mypy/issues/2087
_get_layout.cache = {}  # type: ignore[attr-defined]
copyreg.pickle(torch.layout, lambda obj: (_get_layout, (str(obj),)))


def _legacy_load(f, map_location, pickle_module, **pickle_load_args):
    # Loads checkpoints written by _legacy_save (sequential pickle format) or
    # the even older tar-based format (handled by the inner legacy_load).
    deserialized_objects: Dict[int, Any] = {}

    restore_location = _get_restore_location(map_location)

    class UnpicklerWrapper(pickle_module.Unpickler):  # type: ignore[name-defined]
        def find_class(self, mod_name, name):
            # Legacy checkpoints reference torch.XxxStorage classes; map them
            # to the lightweight StorageType stand-in that just carries dtype.
            if type(name) is str and "Storage" in name:
                try:
                    return StorageType(name)
                except KeyError:
                    pass
            return super().find_class(mod_name, name)

    def _check_container_source(container_type, source_file, original_source):
        # Warn (and optionally write a reverse patch) when an nn.Module class'
        # current source differs from the source recorded at save time.
        try:
            current_source = "".join(get_source_lines_and_file(container_type)[0])
        except Exception:  # saving the source is optional, so we can ignore any errors
            warnings.warn(
                "Couldn't retrieve source code for container of "
                "type " + container_type.__name__ + ". It won't be checked "
                "for correctness upon loading."
            )
            return
        if original_source != current_source:
            if container_type.dump_patches:
                file_name = container_type.__name__ + ".patch"
                diff = difflib.unified_diff(
                    current_source.split("\n"),
                    original_source.split("\n"),
                    source_file,
                    source_file,
                    lineterm="",
                )
                lines = "\n".join(diff)
                try:
                    with open(file_name, "a+") as f:
                        file_size = f.seek(0, 2)
                        f.seek(0)
                        if file_size == 0:
                            f.write(lines)
                        elif file_size != len(lines) or f.read() != lines:
                            raise OSError
                    msg = (
                        "Saved a reverse patch to " + file_name + ". "
                        "Run `patch -p0 < " + file_name + "` to revert your "
                        "changes."
                    )
                except OSError:
                    msg = (
                        "Tried to save a patch, but couldn't create a "
                        "writable file " + file_name + ". Make sure it "
                        "doesn't exist and your working directory is "
                        "writable."
                    )
            else:
                msg = (
                    "you can retrieve the original source code by "
                    "accessing the object's source attribute or set "
                    "`torch.nn.Module.dump_patches = True` and use the "
                    "patch tool to revert the changes."
                )
            msg = f"source code of class '{torch.typename(container_type)}' has changed. {msg}"
            warnings.warn(msg, SourceChangeWarning)

    def legacy_load(f):
        # Reader for the oldest (tar-based) checkpoint layout: members
        # "storages", "tensors" and a pickled object graph.
        deserialized_objects: Dict[int, Any] = {}

        def persistent_load(saved_id):
            if isinstance(saved_id, tuple):
                # Ignore containers that don't have any sources saved
                if all(saved_id[1:]):
                    _check_container_source(*saved_id)
                return saved_id[0]
            return deserialized_objects[int(saved_id)]

        with closing(
            tarfile.open(fileobj=f, mode="r:", format=tarfile.PAX_FORMAT)
        ) as tar, mkdtemp() as tmpdir:
            if pickle_module is _weights_only_unpickler:
                raise RuntimeError(
                    "Cannot use ``weights_only=True`` with files saved in the "
                    "legacy .tar format. " + UNSAFE_MESSAGE
                )
            tar.extract("storages", path=tmpdir)
            with open(os.path.join(tmpdir, "storages"), "rb", 0) as f:
                num_storages = pickle_module.load(f, **pickle_load_args)
                for _ in range(num_storages):
                    args = pickle_module.load(f, **pickle_load_args)
                    key, location, storage_type = args
                    dtype = storage_type._dtype
                    obj = cast(Storage, torch.UntypedStorage)._new_with_file(
                        f, torch._utils._element_size(dtype)
                    )
                    obj = restore_location(obj, location)
                    # TODO: Once we decide to break serialization FC, we can
                    # stop wrapping with TypedStorage
                    deserialized_objects[key] = torch.storage.TypedStorage(
                        wrap_storage=obj, dtype=dtype, _internal=True
                    )

                storage_views = pickle_module.load(f, **pickle_load_args)
                for target_cdata, root_cdata, offset, numel in storage_views:
                    root = deserialized_objects[root_cdata]
                    element_size = torch._utils._element_size(root.dtype)
                    offset_bytes = offset * element_size
                    # TODO: Once we decide to break serialization FC, we can
                    # stop wrapping with TypedStorage
                    deserialized_objects[target_cdata] = torch.storage.TypedStorage(
                        wrap_storage=root._untyped_storage[
                            offset_bytes : offset_bytes + numel * element_size
                        ],
                        dtype=root.dtype,
                        _internal=True,
                    )

            tar.extract("tensors", path=tmpdir)
            with open(os.path.join(tmpdir, "tensors"), "rb", 0) as f:
                num_tensors = pickle_module.load(f, **pickle_load_args)
                for _ in range(num_tensors):
                    args = pickle_module.load(f, **pickle_load_args)
                    key, storage_id, _original_tensor_type = args
                    storage = deserialized_objects[storage_id]
                    (ndim,) = struct.unpack(" str:
    # NOTE(review): the patch hunk is truncated mid-line above — the remainder
    # of legacy_load / _legacy_load and the `def _maybe_decode_ascii(...)`
    # signature are missing from this view of the diff.
    # When using encoding='bytes' in Py3, some **internal** keys stored as
    # strings in Py2 are loaded as bytes. This function decodes them with
    # ascii encoding, one that Py3 uses by default.
    #
    # NOTE: This should only be used on internal keys (e.g., `typename` and
    # `location` in `persistent_load` below!
    # (tail of _maybe_decode_ascii, whose `def` line sits at the end of the
    # previous chunk) — decode internal byte keys with ASCII, pass str through.
    if isinstance(bytes_str, bytes):
        return bytes_str.decode("ascii")
    return bytes_str


def _get_restore_location(map_location):
    """Build the (storage, location) -> storage resolver for ``map_location``.

    The returned callable implements the ``map_location`` policy used by
    ``_load``:

    * ``None`` — use the registry-driven ``default_restore_location``.
    * ``dict`` — remap the saved location tag through the dict, then restore.
    * ``str``/``bytes`` — force every storage onto that single location.
    * ``torch.device`` — force every storage onto ``str(device)``.
    * callable — user hook; falls back to the default when it returns ``None``.
    """
    if map_location is None:
        restore_location = default_restore_location
    elif isinstance(map_location, dict):

        def restore_location(storage, location):
            # Unmapped tags fall through unchanged (dict.get default).
            location = map_location.get(location, location)
            return default_restore_location(storage, location)

    elif isinstance(map_location, (str, bytes)):

        def restore_location(storage, location):
            # Saved tag is ignored; everything goes to map_location.
            return default_restore_location(storage, map_location)

    elif isinstance(map_location, torch.device):

        def restore_location(storage, location):
            return default_restore_location(storage, str(map_location))

    else:

        def restore_location(storage, location):
            result = map_location(storage, location)
            if result is None:
                # User callback declined; use the registered deserializers.
                result = default_restore_location(storage, location)
            return result

    return restore_location


class StorageType:
    """Stand-in returned by the unpickler for legacy ``torch.*Storage`` names.

    Only carries the dtype recovered from the pickled storage-type name; the
    actual storage object is produced separately by ``persistent_load``.
    """

    def __init__(self, name):
        self._dtype = _get_dtype_from_pickle_storage_type(name)

    @property
    def dtype(self):
        return self._dtype

    def __str__(self):
        return f"StorageType(dtype={self.dtype})"


def _load(
    zip_file,
    map_location,
    pickle_module,
    pickle_file="data.pkl",
    overall_storage=None,
    **pickle_load_args,
):
    """Deserialize a zipfile-format checkpoint (the ``torch.save`` zip layout).

    ``overall_storage`` is an optional pre-mapped storage covering the whole
    archive (mmap path); when present, tensor storages are sliced out of it by
    record offset instead of being read record-by-record.
    """
    restore_location = _get_restore_location(map_location)

    # Cache of already-materialized storages, keyed by the pickled storage key,
    # so aliased tensors share one storage object.
    loaded_storages = {}

    # check if byteswapping is needed
    byteordername = "byteorder"
    byteorderdata = None
    if zip_file.has_record(byteordername):
        byteorderdata = zip_file.get_record(byteordername)
        if byteorderdata not in [b"little", b"big"]:
            raise ValueError("Unknown endianness type: " + byteorderdata.decode())
    elif (
        get_default_load_endianness() == LoadEndianness.LITTLE
        or get_default_load_endianness() is None
    ):
        # No byteorder record in the checkpoint: fall back to the configured
        # default (little-endian when unset).
        byteorderdata = b"little"
    elif get_default_load_endianness() == LoadEndianness.BIG:
        byteorderdata = b"big"
    elif get_default_load_endianness() == LoadEndianness.NATIVE:
        pass
    else:
        raise 
ValueError("Invalid load endianness type")

    if (
        not zip_file.has_record(byteordername)
        and get_default_load_endianness() is None
        and sys.byteorder == "big"
    ):
        # Default behaviour was changed
        # See https://github.com/pytorch/pytorch/issues/101688
        warnings.warn(
            "The default load endianness for checkpoints without a byteorder mark "
            "on big endian machines was changed from 'native' to 'little' endian, "
            "to avoid this behavior please use "
            "torch.serialization.set_default_load_endianness to set "
            "the desired default load endianness",
            UserWarning,
        )

    def load_tensor(dtype, numel, key, location):
        # NOTE(review): despite the name, callers pass a *byte* count as
        # ``numel`` (see persistent_load below, which multiplies by the
        # element size before calling) — confirm before relying on the name.
        name = f"data/{key}"
        if torch._guards.detect_fake_mode(None) is not None:
            # Fake-tensor mode: allocate a meta storage, never touch the bytes.
            nbytes = numel * torch._utils._element_size(dtype)
            storage = torch.UntypedStorage(nbytes, device="meta")
        elif overall_storage is not None:
            # mmap path: slice the pre-mapped archive at the record's offset.
            storage_offset = zip_file.get_record_offset(name)
            storage = overall_storage[storage_offset : storage_offset + numel]
        else:
            storage = (
                zip_file.get_storage_from_record(name, numel, torch.UntypedStorage)
                ._typed_storage()
                ._untyped_storage
            )
        # swap here if byteswapping is needed
        if byteorderdata is not None:
            if byteorderdata.decode() != sys.byteorder:
                storage.byteswap(dtype)

        # TODO: Once we decide to break serialization FC, we can
        # stop wrapping with TypedStorage
        typed_storage = torch.storage.TypedStorage(
            wrap_storage=restore_location(storage, location),
            dtype=dtype,
            _internal=True,
        )

        # Zero data_ptr means an unallocated storage; don't cache those.
        if typed_storage._data_ptr() != 0:
            loaded_storages[key] = typed_storage

        return typed_storage

    def persistent_load(saved_id):
        # Pickle persistent-id hook: every out-of-band object in a torch
        # checkpoint is a ("storage", type, key, location, numel) tuple.
        assert isinstance(saved_id, tuple)
        typename = _maybe_decode_ascii(saved_id[0])
        data = saved_id[1:]

        assert (
            typename == "storage"
        ), f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'"
        storage_type, key, location, numel = data
        if storage_type is torch.UntypedStorage:
            dtype = torch.uint8
        else:
            dtype = 
storage_type.dtype

        if key in loaded_storages:
            # Storage already materialized for an aliasing tensor — reuse it.
            typed_storage = loaded_storages[key]
        else:
            # load_tensor's second argument is a byte count (see its NOTE).
            nbytes = numel * torch._utils._element_size(dtype)
            typed_storage = load_tensor(
                dtype, nbytes, key, _maybe_decode_ascii(location)
            )

        return typed_storage

    # Backward-compat module renames applied during unpickling.
    load_module_mapping: Dict[str, str] = {
        # See https://github.com/pytorch/pytorch/pull/51633
        "torch.tensor": "torch._tensor"
    }

    # Need to subclass Unpickler instead of directly monkey-patching the
    # find_class method because it's marked readonly in pickle.
    # The type: ignore is because mypy can't statically determine the type of this class.
    class UnpicklerWrapper(pickle_module.Unpickler):  # type: ignore[name-defined]
        # from https://stackoverflow.com/questions/13398462/unpickling-python-objects-with-a-changed-module-path/13405732
        # Lets us override the imports that pickle uses when unpickling an object.
        # This is useful for maintaining BC if we change
        # a module path that tensor instantiation relies on.
        def find_class(self, mod_name, name):
            if type(name) is str and "Storage" in name:
                try:
                    # Legacy typed-storage class names resolve to a dtype proxy.
                    return StorageType(name)
                except KeyError:
                    pass
            mod_name = load_module_mapping.get(mod_name, mod_name)
            return super().find_class(mod_name, name)

    # Load the data (which may in turn use `persistent_load` to load tensors)
    data_file = io.BytesIO(zip_file.get_record(pickle_file))

    unpickler = UnpicklerWrapper(data_file, **pickle_load_args)
    unpickler.persistent_load = persistent_load
    # Needed for tensors where storage device and rebuild tensor device are
    # not connected (wrapper subclasses and tensors rebuilt using numpy)
    global _serialization_tls
    _serialization_tls.map_location = map_location
    result = unpickler.load()
    # Reset the thread-local so later unrelated loads don't inherit it.
    _serialization_tls.map_location = None

    torch._utils._validate_loaded_sparse_tensors()
    torch._C._log_api_usage_metadata(
        "torch.load.metadata", {"serialization_id": zip_file.serialization_id()}
    )
    return result


def 
_is_torchscript_zip(zip_file): + return "constants.pkl" in zip_file.get_all_records() diff --git a/deepnvme/model_checkpoint/torch/serialization_orig_v2.6.0.py b/deepnvme/model_checkpoint/torch/serialization_orig_v2.6.0.py new file mode 100644 index 000000000..ad23e92cc --- /dev/null +++ b/deepnvme/model_checkpoint/torch/serialization_orig_v2.6.0.py @@ -0,0 +1,1975 @@ +# mypy: allow-untyped-defs +import copyreg +import difflib +import functools +import io +import os +import pickle +import re +import shutil +import struct +import sys +import tarfile +import tempfile +import threading +import warnings +from contextlib import closing, contextmanager +from enum import Enum +from typing import ( + Any, + BinaryIO, + Callable, + cast, + Dict, + IO, + List, + Optional, + Tuple, + Type, + Union, +) +from typing_extensions import TypeAlias, TypeIs + +import torch +import torch._weights_only_unpickler as _weights_only_unpickler +from torch._sources import get_source_lines_and_file +from torch._utils import _import_dotted_name +from torch.storage import _get_dtype_from_pickle_storage_type +from torch.types import Storage + + +__all__ = [ + "SourceChangeWarning", + "mkdtemp", + "register_package", + "check_module_version_greater_or_equal", + "validate_cuda_device", + "validate_hpu_device", + "location_tag", + "default_restore_location", + "normalize_storage_type", + "storage_to_tensor_type", + "save", + "load", + "StorageType", + "LoadEndianness", + "get_crc32_options", + "set_crc32_options", + "get_default_load_endianness", + "set_default_load_endianness", + "get_default_mmap_options", + "set_default_mmap_options", + "clear_safe_globals", + "get_safe_globals", + "add_safe_globals", + "safe_globals", + "get_unsafe_globals_in_checkpoint", + "skip_data", +] + +DEFAULT_PROTOCOL = 2 + +LONG_SIZE = struct.Struct("=l").size +INT_SIZE = struct.Struct("=i").size +SHORT_SIZE = struct.Struct("=h").size + +MAGIC_NUMBER = 0x1950A86A20F9469CFC6C +PROTOCOL_VERSION = 1001 
+STORAGE_KEY_SEPARATOR = "," + +FILE_LIKE: TypeAlias = Union[str, os.PathLike, BinaryIO, IO[bytes]] +MAP_LOCATION: TypeAlias = Optional[ + Union[Callable[[Storage, str], Storage], torch.device, str, Dict[str, str]] +] +STORAGE: TypeAlias = Union[Storage, torch.storage.TypedStorage, torch.UntypedStorage] + +IS_WINDOWS = sys.platform == "win32" + +UNSAFE_MESSAGE = ( + "In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` " + "from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, " + "but it can result in arbitrary code execution. Do it only if you got the file from a " + "trusted source." +) + +if not IS_WINDOWS: + from mmap import MAP_PRIVATE, MAP_SHARED +else: + MAP_SHARED, MAP_PRIVATE = None, None # type: ignore[assignment] + + +def _default_to_weights_only(pickle_module): + is_fbcode = not hasattr(torch.version, "git_version") + return pickle_module is None and not is_fbcode + + +# _serialization_tls is used to store thread local state specific to serialization +# that needs to be propagated to other files, in particular we use this for +# (1) map_location (needed for wrapper subclasses/third party devices to torch._utils) +# (2) skip_data (needed for torch.Tensor.__reduce_ex__ for skip_data ctx) +# (3) materialize_fake_tensors (needed for torch.Tensor.__reduce_ex__ for skip_data ctx) +class _SerializationLocal(threading.local): + def __init__(self): + super().__init__() + self.map_location: Optional[MAP_LOCATION] = None + self.skip_data: bool = False + self.materialize_fake_tensors: bool = False + + +_serialization_tls = _SerializationLocal() + + +class SourceChangeWarning(Warning): + pass + + +@contextmanager +def mkdtemp(): + path = tempfile.mkdtemp() + try: + yield path + finally: + shutil.rmtree(path) + + +_package_registry: List[ + Tuple[ + int, + Callable[[STORAGE], Optional[str]], + Callable[[STORAGE, str], Optional[STORAGE]], + ] +] = [] + + +class 
LoadEndianness(Enum): + NATIVE = 1 + LITTLE = 2 + BIG = 3 + + +_default_load_endian: Optional[LoadEndianness] = None + + +def get_default_load_endianness() -> Optional[LoadEndianness]: + """ + Get fallback byte order for loading files + + If byteorder mark is not present in saved checkpoint, + this byte order is used as fallback. + By default, it's "native" byte order. + + Returns: + default_load_endian: Optional[LoadEndianness] + """ + return _default_load_endian + + +def set_default_load_endianness(endianness): + """ + Set fallback byte order for loading files + + If byteorder mark is not present in saved checkpoint, + this byte order is used as fallback. + By default, it's "native" byte order. + + Args: + endianness: the new fallback byte order + """ + global _default_load_endian + if not isinstance(endianness, LoadEndianness) and endianness is not None: + raise TypeError("Invalid argument type in function set_default_load_endianness") + _default_load_endian = endianness + + +_compute_crc32: bool = True + + +def get_crc32_options() -> bool: + """ + Get whether :func:`torch.save` computes and writes crc32 for each record. + + Defaults to ``True``. + """ + return _compute_crc32 + + +def set_crc32_options(compute_crc32: bool): + """ + Set whether :func:`torch.save` computes and writes crc32 for each record. + + .. note:: + Setting this to ``False`` may make unzipping of the ``torch.save`` output + fail or warn due to corrupted CRC32. However ``torch.load`` will be + able to load the file. + + Args: + compute_crc32 (bool): set crc32 compuation flag + """ + global _compute_crc32 + _compute_crc32 = compute_crc32 + + +_default_mmap_options: int = MAP_PRIVATE + + +def get_default_mmap_options() -> int: + """ + Get default mmap options for :func:`torch.load` with ``mmap=True``. + + Defaults to ``mmap.MAP_PRIVATE``. 
+ + + Returns: + default_mmap_options: int + """ + return _default_mmap_options + + +class set_default_mmap_options: + """ + Context manager or function to set default mmap options for :func:`torch.load` with ``mmap=True`` to flags. + + For now, only either ``mmap.MAP_PRIVATE`` or ``mmap.MAP_SHARED`` are supported. + Please open an issue if you need any other option to be added here. + + .. note:: + This feature is currently not supported for Windows. + + Args: + flags: ``mmap.MAP_PRIVATE`` or ``mmap.MAP_SHARED`` + """ + + def __init__(self, flags: int) -> None: + if IS_WINDOWS: + raise RuntimeError( + "Changing the default mmap options is currently not supported for Windows" + ) + if flags != MAP_PRIVATE and flags != MAP_SHARED: + raise ValueError( + "Invalid argument in function set_default_mmap_options, " + f"expected mmap.MAP_PRIVATE or mmap.MAP_SHARED, but got {flags}" + ) + global _default_mmap_options + self.prev = _default_mmap_options + _default_mmap_options = flags + + def __enter__(self) -> None: + pass + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + global _default_mmap_options + _default_mmap_options = self.prev + + +def clear_safe_globals() -> None: + """ + Clears the list of globals that are safe for ``weights_only`` load. + """ + _weights_only_unpickler._clear_safe_globals() + + +def get_safe_globals() -> List[Union[Callable, Tuple[Callable, str]]]: + """ + Returns the list of user-added globals that are safe for ``weights_only`` load. + """ + return _weights_only_unpickler._get_safe_globals() + + +def add_safe_globals(safe_globals: List[Union[Callable, Tuple[Callable, str]]]) -> None: + """ + Marks the given globals as safe for ``weights_only`` load. For example, functions + added to this list can be called during unpickling, classes could be instantiated + and have state set. 
+ + Each item in the list can either be a function/class or a tuple of the form + (function/class, string) where string is the full path of the function/class. + + Within the serialized format, each function is identified with its full + path as ``{__module__}.{__name__}``. When calling this API, you can provide this + full path that should match the one in the checkpoint otherwise the default + ``{fn.__module__}.{fn.__name__}`` will be used. + + Args: + safe_globals (List[Union[Callable, Tuple[Callable, str]]]): list of globals to mark as safe + + Example: + >>> # xdoctest: +SKIP("Can't torch.save(t, ...) as doctest thinks MyTensor is defined on torch.serialization") + >>> import tempfile + >>> class MyTensor(torch.Tensor): + ... pass + >>> t = MyTensor(torch.randn(2, 3)) + >>> with tempfile.NamedTemporaryFile() as f: + ... torch.save(t, f.name) + # Running `torch.load(f.name, weights_only=True)` will fail with + # Unsupported global: GLOBAL __main__.MyTensor was not an allowed global by default. + # Check the code and make sure MyTensor is safe to be used when loaded from an arbitrary checkpoint. + ... torch.serialization.add_safe_globals([MyTensor]) + ... torch.load(f.name, weights_only=True) + # MyTensor([[-0.5024, -1.8152, -0.5455], + # [-0.8234, 2.0500, -0.3657]]) + """ + _weights_only_unpickler._add_safe_globals(safe_globals) + + +class safe_globals(_weights_only_unpickler._safe_globals): + r"""Context-manager that adds certain globals as safe for ``weights_only`` load. + + Args: + safe_globals: List of globals for weights_only load. + + Example: + >>> # xdoctest: +SKIP("Can't torch.save(t, ...) as doctest thinks MyTensor is defined on torch.serialization") + >>> import tempfile + >>> class MyTensor(torch.Tensor): + ... pass + >>> t = MyTensor(torch.randn(2, 3)) + >>> with tempfile.NamedTemporaryFile() as f: + ... 
torch.save(t, f.name) + # Running `torch.load(f.name, weights_only=True)` will fail with + # Unsupported global: GLOBAL __main__.MyTensor was not an allowed global by default. + # Check the code and make sure MyTensor is safe to be used when loaded from an arbitrary checkpoint. + ... with torch.serialization.safe_globals([MyTensor]): + ... torch.load(f.name, weights_only=True) + # MyTensor([[-0.5024, -1.8152, -0.5455], + # [-0.8234, 2.0500, -0.3657]]) + >>> assert torch.serialization.get_safe_globals() == [] + """ + + +def get_unsafe_globals_in_checkpoint(f: FILE_LIKE) -> List[str]: + """Returns a list of strings of functions/classes in a ``torch.save`` object that are not safe for ``weights_only``. + + For a given function or class ``f``, the corresponding string will be of the form + ``{f.__module__}.{f.__name__}``. + + This function will return any GLOBALs in the checkpoint that are not in the set marked safe + for ``weights_only`` (either via :func:`add_safe_globals` or :class:`safe_globals` context or + allowlisted by ``torch`` by default). + + .. note:: + This function will statically disassemble the pickle file in the checkpoint. + The implication is any classes dynamically pushed onto the stack during unpickling + will not be included in the output. + + Args: + f: File-like object or string containing the checkpoint object saved via ``torch.save`` + + Returns: + A list of strings of pickle GLOBALs in the checkpoint that are not allowlisted for ``weights_only``. 
+ """ + default_safe_globals_strings = set( + _weights_only_unpickler._get_allowed_globals().keys() + ) + user_safe_global_strings = set( + _weights_only_unpickler._get_user_allowed_globals().keys() + ) + safe_global_strings = default_safe_globals_strings.union(user_safe_global_strings) + + with _open_file_like(f, "rb") as opened_file: + if not _is_zipfile(opened_file): + raise ValueError("Expected input to be a checkpoint returned by torch.save") + with _open_zipfile_reader(opened_file) as zip_file: + if _is_torchscript_zip(zip_file): + raise ValueError( + "Expected input to be a checkpoint returned by torch.save but got a torchscript checkpoint" + ) + data_file = io.BytesIO(zip_file.get_record("data.pkl")) + all_globals = _weights_only_unpickler.get_globals_in_pkl(data_file) + return list(all_globals.difference(safe_global_strings)) + + +class skip_data: + """ + Context-manager that skips writing storage bytes for ``torch.save`` calls. + + Storages will still be saved, but the space that their bytes would usually be written to + will be empty space. The storage bytes can then be populated in a separate pass. + + .. warning:: + The ``skip_data`` context manager is an early prototype and is subject to change. + + Args: + materialize_fake_tensors: Whether to materialize FakeTensors. + + Example: + >>> # xdoctest: +SKIP("NamedTemporaryFile on Windows") + >>> import tempfile + >>> t = torch.randn(2, 3) + >>> with tempfile.NamedTemporaryFile() as f: + ... with torch.serialization.skip_data(): + ... torch.save(t, f.name) + ... 
torch.load(f.name, weights_only=True) + tensor([[0., 0., 0.], + [0., 0., 0.]]) + """ + + def __init__(self, materialize_fake_tensors: bool = False): + self.materialize_fake_tensors = materialize_fake_tensors + + def __enter__(self): + global _serialization_tls + self._old_skip_data = _serialization_tls.skip_data + self._old_materialize_fake_tensors = _serialization_tls.materialize_fake_tensors + _serialization_tls.skip_data = True + _serialization_tls.materialize_fake_tensors = self.materialize_fake_tensors + + def __exit__(self, type, value, tb): + global _serialization_tls + _serialization_tls.skip_data = self._old_skip_data + _serialization_tls.materialize_fake_tensors = self._old_materialize_fake_tensors + + +def _is_zipfile(f) -> bool: + # This is a stricter implementation than zipfile.is_zipfile(). + # zipfile.is_zipfile() is True if the magic number appears anywhere in the + # binary. Since we expect the files here to be generated by torch.save or + # torch.jit.save, it's safe to only check the start bytes and avoid + # collisions and assume the zip has only 1 file. + # See bugs.python.org/issue28494. + + start = f.tell() + # Read the first few bytes and match against the ZIP file signature + local_header_magic_number = b"PK\x03\x04" + read_bytes = f.read(len(local_header_magic_number)) + f.seek(start) + return read_bytes == local_header_magic_number + + +def register_package( + priority: int, + tagger: Callable[[STORAGE], Optional[str]], + deserializer: Callable[[STORAGE, str], Optional[STORAGE]], +): + """ + Registers callables for tagging and deserializing storage objects with an associated priority. + Tagging associates a device with a storage object at save time while deserializing moves a + storage object to an appropriate device at load time. :attr:`tagger` and :attr:`deserializer` + are run in the order given by their :attr:`priority` until a tagger/deserializer returns a + value that is not `None`. 
+ + To override the deserialization behavior for a device in the global registry, one can register a + tagger with a higher priority than the existing tagger. + + This function can also be used to register a tagger and deserializer for new devices. + + Args: + priority: Indicates the priority associated with the tagger and deserializer, where a lower + value indicates higher priority. + tagger: Callable that takes in a storage object and returns its tagged device as a string + or None. + deserializer: Callable that takes in storage object and a device string and returns a storage + object on the appropriate device or None. + + Returns: + `None` + + Example: + >>> def ipu_tag(obj): + >>> if obj.device.type == 'ipu': + >>> return 'ipu' + >>> def ipu_deserialize(obj, location): + >>> if location.startswith('ipu'): + >>> ipu = getattr(torch, "ipu", None) + >>> assert ipu is not None, "IPU device module is not loaded" + >>> assert torch.ipu.is_available(), "ipu is not available" + >>> return obj.ipu(location) + >>> torch.serialization.register_package(11, ipu_tag, ipu_deserialize) + """ + queue_elem = (priority, tagger, deserializer) + _package_registry.append(queue_elem) + _package_registry.sort() + + +def check_module_version_greater_or_equal( + module, + req_version_tuple, + error_if_malformed=True, +): + """ + Check if a module's version satisfies requirements + + Usually, a module's version string will be like 'x.y.z', which would be represented + as a tuple (x, y, z), but sometimes it could be an unexpected format. If the version + string does not match the given tuple's format up to the length of the tuple, then + error and exit or emit a warning. 
+ + Args: + module: the module to check the version of + req_version_tuple: tuple (usually of ints) representing the required version + error_if_malformed: whether we should exit if module version string is malformed + + Returns: + requirement_is_met: bool + """ + try: + version_strs = module.__version__.split(".") + # Cast module version fields to match the types of the required version + module_version = tuple( + type(req_field)(version_strs[idx]) + for idx, req_field in enumerate(req_version_tuple) + ) + requirement_is_met = module_version >= req_version_tuple + + except Exception as e: + message = ( + f"'{module.__name__}' module version string is malformed '{module.__version__}' and cannot be compared" + f" with tuple {str(req_version_tuple)}" + ) + if error_if_malformed: + raise RuntimeError(message) from e + else: + warnings.warn(message + ", but continuing assuming that requirement is met") + requirement_is_met = True + + return requirement_is_met + + +def _cpu_tag(obj): + if obj.device.type == "cpu": + return "cpu" + + +def _mps_tag(obj): + if obj.device.type == "mps": + return "mps" + + +def _meta_tag(obj): + if obj.device.type == "meta": + return "meta" + + +def _backend_tag(backend_name, obj): + if backend_name == "privateuse1": + backend_name = torch._C._get_privateuse1_backend_name() + if obj.device.type == backend_name: + if obj.device.index is None: + return backend_name + else: + return backend_name + ":" + str(obj.device.index) + + +def _cpu_deserialize(obj, location): + if location == "cpu": + return obj + + +def _mps_deserialize(obj, location): + if location.startswith("mps"): + return obj.mps() + + +def _meta_deserialize(obj, location): + if location == "meta": + return torch.UntypedStorage(obj.nbytes(), device="meta") + + +def _validate_device(location, backend_name): + """ + Check whether the device index of specified backend is valid + + In case of privateuse1 backend, your must first register a device_module for + privateuse1 using 
torch._register_device_module. Implement the following + methods in device_module like cuda: device_module._utils._get_device_index(location, True), + device_module.device_count(). + + Args: + location: string of device + backend_name: the backend name or the name of privateuse1, which can be renamed + + Returns: + device_index: int + """ + if not hasattr(torch, backend_name): + raise RuntimeError( + f"The {backend_name.upper()} device module is not registered. " + "If you are running on a CPU-only machine, " + "please use torch.load with map_location=torch.device('cpu') " + "to map your storages to the CPU." + ) + device_module = getattr(torch, backend_name) + if hasattr(device_module, "_utils") and hasattr( + device_module._utils, "_get_device_index" + ): + device_index = device_module._utils._get_device_index(location, True) + device = torch.device(backend_name, device_index) + else: + device = torch.device(location) + device_index = device.index if device.index else 0 + if hasattr(device_module, "is_available") and not device_module.is_available(): + raise RuntimeError( + f"Attempting to deserialize object on a {backend_name.upper()} " + f"device but torch.{backend_name}.is_available() is False. " + "If you are running on a CPU-only machine, " + "please use torch.load with map_location=torch.device('cpu') " + "to map your storages to the CPU." + ) + if hasattr(device_module, "device_count"): + device_count = device_module.device_count() + if device_index >= device_count: + raise RuntimeError( + f"Attempting to deserialize object on {backend_name.upper()} device " + f"{device_index} but torch.{backend_name}.device_count() is {device_count}. " + "Please use torch.load with map_location to map your storages " + "to an existing device." 
+ ) + return device + + +def validate_cuda_device(location): + return _validate_device(location, "cuda").index + + +def validate_hpu_device(location): + return _validate_device(location, "hpu").index + + +def _deserialize(backend_name, obj, location): + if backend_name == "privateuse1": + backend_name = torch._C._get_privateuse1_backend_name() + if location.startswith(backend_name): + device = _validate_device(location, backend_name) + return obj.to(device=device) + + +register_package(10, _cpu_tag, _cpu_deserialize) +register_package( + 20, + functools.partial(_backend_tag, "cuda"), + functools.partial(_deserialize, "cuda"), +) +register_package(21, _mps_tag, _mps_deserialize) +register_package(22, _meta_tag, _meta_deserialize) +register_package( + 23, + functools.partial(_backend_tag, "privateuse1"), + functools.partial(_deserialize, "privateuse1"), +) +register_package( + 24, + functools.partial(_backend_tag, "hpu"), + functools.partial(_deserialize, "hpu"), +) +register_package( + 25, + functools.partial(_backend_tag, "xpu"), + functools.partial(_deserialize, "xpu"), +) + + +def location_tag( + storage: Union[Storage, torch.storage.TypedStorage, torch.UntypedStorage], +): + for _, tagger, _ in _package_registry: + location = tagger(storage) + if location: + return location + raise RuntimeError( + "don't know how to determine data location of " + torch.typename(storage) + ) + + +def default_restore_location(storage, location): + """ + Restores `storage` using a deserializer function registered for the `location`. + + This function looks in the registry for deserializer functions that match the `location`. + If found, it attempts to use them, in priority order, to restore `storage` until one + returns a not `None` result. If no deserializer can be found in the registry, or all found fail + to bear a result, it raises a `RuntimeError`. 
+ + Args: + storage (STORAGE): the storage object to restore + location (str): the location tag associated with the storage object + + Returns: + storage: Optional[STORAGE] + + Raises: + RuntimeError: If no deserializer matching `location` is found in the registry or if + all matching ones return `None`. + """ + for _, _, fn in _package_registry: + result = fn(storage, location) + if result is not None: + return result + raise RuntimeError( + "don't know how to restore data location of " + + torch.typename(storage) + + " (tagged with " + + location + + ")" + ) + + +def normalize_storage_type(storage_type): + return getattr(torch, storage_type.__name__) + + +def storage_to_tensor_type(storage): + storage_type = type(storage) + module = _import_dotted_name(storage_type.__module__) + return getattr(module, storage_type.__name__.replace("Storage", "Tensor")) + + +def _is_path(name_or_buffer) -> TypeIs[Union[str, os.PathLike]]: + return isinstance(name_or_buffer, (str, os.PathLike)) + + +class _opener: + def __init__(self, file_like): + self.file_like = file_like + + def __enter__(self): + return self.file_like + + def __exit__(self, *args): + pass + + +class _open_file(_opener): + def __init__(self, name, mode): + super().__init__(open(name, mode)) + + def __exit__(self, *args): + self.file_like.close() + + +class _open_buffer_reader(_opener): + def __init__(self, buffer): + super().__init__(buffer) + _check_seekable(buffer) + + +class _open_buffer_writer(_opener): + def __exit__(self, *args): + self.file_like.flush() + + +def _open_file_like(name_or_buffer, mode): + if _is_path(name_or_buffer): + return _open_file(name_or_buffer, mode) + else: + if "w" in mode: + return _open_buffer_writer(name_or_buffer) + elif "r" in mode: + return _open_buffer_reader(name_or_buffer) + else: + raise RuntimeError(f"Expected 'r' or 'w' in mode but got {mode}") + + +class _open_zipfile_reader(_opener): + def __init__(self, name_or_buffer) -> None: + 
super().__init__(torch._C.PyTorchFileReader(name_or_buffer)) + + +class _open_zipfile_writer_file(_opener): + def __init__(self, name) -> None: + self.file_stream = None + self.name = str(name) + try: + self.name.encode("ascii") + except UnicodeEncodeError: + # PyTorchFileWriter only supports ascii filename. + # For filenames with non-ascii characters, we rely on Python + # for writing out the file. + self.file_stream = io.FileIO(self.name, mode="w") + super().__init__( + torch._C.PyTorchFileWriter(self.file_stream, _compute_crc32) + ) + else: + super().__init__(torch._C.PyTorchFileWriter(self.name, _compute_crc32)) + + def __exit__(self, *args) -> None: + self.file_like.write_end_of_file() + if self.file_stream is not None: + self.file_stream.close() + + +class _open_zipfile_writer_buffer(_opener): + def __init__(self, buffer) -> None: + if not callable(getattr(buffer, "write", None)): + msg = f"Buffer of {str(type(buffer)).strip('<>')} has no callable attribute 'write'" + if not hasattr(buffer, "write"): + raise AttributeError(msg) + raise TypeError(msg) + self.buffer = buffer + super().__init__(torch._C.PyTorchFileWriter(buffer, _compute_crc32)) + + def __exit__(self, *args) -> None: + self.file_like.write_end_of_file() + self.buffer.flush() + + +def _open_zipfile_writer(name_or_buffer): + container: Type[_opener] + if _is_path(name_or_buffer): + container = _open_zipfile_writer_file + else: + container = _open_zipfile_writer_buffer + return container(name_or_buffer) + + +def _is_compressed_file(f) -> bool: + compress_modules = ["gzip"] + try: + return f.__module__ in compress_modules + except AttributeError: + return False + + +def _should_read_directly(f): + """ + Checks if f is a file that should be read directly. It should be read + directly if it is backed by a real file (has a fileno) and is not a + a compressed file (e.g. 
gzip) + """ + if _is_compressed_file(f): + return False + try: + return f.fileno() >= 0 + except io.UnsupportedOperation: + return False + except AttributeError: + return False + + +def _check_seekable(f) -> bool: + def raise_err_msg(patterns, e): + for p in patterns: + if p in str(e): + msg = ( + str(e) + + ". You can only torch.load from a file that is seekable." + + " Please pre-load the data into a buffer like io.BytesIO and" + + " try to load from it instead." + ) + raise type(e)(msg) + raise e + + try: + f.seek(f.tell()) + return True + except (io.UnsupportedOperation, AttributeError) as e: + raise_err_msg(["seek", "tell"], e) + return False + + +def _check_dill_version(pickle_module) -> None: + """Checks if using dill as the pickle module, and if so, checks if it is the correct version. + If dill version is lower than 0.3.1, a ValueError is raised. + + Args: + pickle_module: module used for pickling metadata and objects + + """ + if pickle_module is not None and pickle_module.__name__ == "dill": + required_dill_version = (0, 3, 1) + if not check_module_version_greater_or_equal( + pickle_module, required_dill_version, False + ): + raise ValueError( + ( + "'torch' supports dill >= {}, but you have dill {}." + " Please upgrade dill or switch to 'pickle'" + ).format( + ".".join([str(num) for num in required_dill_version]), + pickle_module.__version__, + ) + ) + + +def _check_save_filelike(f): + if not _is_path(f) and not hasattr(f, "write"): + raise AttributeError( + "expected 'f' to be string, path, or a file-like object with " + "a 'write' attribute" + ) + + +def save( + obj: object, + f: FILE_LIKE, + pickle_module: Any = pickle, + pickle_protocol: int = DEFAULT_PROTOCOL, + _use_new_zipfile_serialization: bool = True, + _disable_byteorder_record: bool = False, +) -> None: + # Reference: https://github.com/pytorch/pytorch/issues/54354 + # The first line of this docstring overrides the one Sphinx generates for the + # documentation. 
We need it so that Sphinx doesn't leak `pickle`s path from + # the build environment (e.g. `>> # xdoctest: +SKIP("makes cwd dirty") + >>> # Save to file + >>> x = torch.tensor([0, 1, 2, 3, 4]) + >>> torch.save(x, "tensor.pt") + >>> # Save to io.BytesIO buffer + >>> buffer = io.BytesIO() + >>> torch.save(x, buffer) + """ + torch._C._log_api_usage_once("torch.save") + _check_dill_version(pickle_module) + _check_save_filelike(f) + + if _use_new_zipfile_serialization: + with _open_zipfile_writer(f) as opened_zipfile: + _save( + obj, + opened_zipfile, + pickle_module, + pickle_protocol, + _disable_byteorder_record, + ) + return + else: + global _serialization_tls + if _serialization_tls.skip_data: + raise RuntimeError( + "Cannot use skip_data=True with _use_new_zipfile_serialization=False" + ) + with _open_file_like(f, "wb") as opened_file: + _legacy_save(obj, opened_file, pickle_module, pickle_protocol) + + +def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None: + import torch.nn as nn + + serialized_container_types = {} + serialized_storages: Dict[str, Tuple[torch.UntypedStorage, torch.dtype]] = {} + + # Since loading storages that view the same data with different dtypes is + # not supported, we need to keep track of the dtype associated with each + # storage data_ptr and throw an error if the dtype is ever different. + # TODO: This feature could be added in the future + storage_dtypes: Dict[int, torch.dtype] = {} + + def persistent_id(obj: Any) -> Optional[Tuple]: + # FIXME: the docs say that persistent_id should only return a string + # but torch store returns tuples. 
This works only in the binary protocol + # see + # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects + # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 + if isinstance(obj, type) and issubclass(obj, nn.Module): + if obj in serialized_container_types: + return None + serialized_container_types[obj] = True + source_file = source = None + try: + source_lines, _, source_file = get_source_lines_and_file(obj) + source = "".join(source_lines) + except ( + Exception + ): # saving the source is optional, so we can ignore any errors + warnings.warn( + "Couldn't retrieve source code for container of " + "type " + obj.__name__ + ". It won't be checked " + "for correctness upon loading." + ) + return ("module", obj, source_file, source) + + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + storage: torch.UntypedStorage + + if isinstance(obj, torch.storage.TypedStorage): + # TODO: Once we decide to break serialization FC, this case + # can be deleted + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + dtype = obj.dtype + storage_numel = obj._size() + + elif isinstance(obj, torch.UntypedStorage): + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + dtype = torch.uint8 + storage_numel = storage.nbytes() + else: + raise TypeError(f"type not recognized: {type(obj)}") + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. 
If storage is
+            # not allocated, don't perform this check
+            if storage.data_ptr() != 0:
+                if storage.data_ptr() in storage_dtypes:
+                    if storage_dtype != storage_dtypes[storage.data_ptr()]:
+                        raise RuntimeError(
+                            "Cannot save multiple tensors or storages that "
+                            "view the same data as different types"
+                        )
+                else:
+                    storage_dtypes[storage.data_ptr()] = storage_dtype
+
+            view_metadata: Optional[Tuple[str, int, int]]
+
+            # Offset is always 0, but we keep it for backwards compatibility
+            # with the old serialization format (which supported storage views)
+            offset = 0
+            storage_key = str(storage._cdata)
+            location = location_tag(storage)
+
+            # TODO: There's an issue here with FC. It might be impossible to
+            # solve, but it's worth noting. Imagine we save a list `[storage,
+            # tensor]`, where `tensor.storage()` is the same as `storage`, and
+            # `tensor.element_size() > 1`. Let's say that `tensor.dtype ==
+            # torch.float`. The storage will be serialized with element size
+            # of 1, since we're choosing to serialize the first occurrence of
+            # a duplicate storage. Since this legacy serialization format saves
+            # the numel of the storage, rather than nbytes directly, we'll be
+            # effectively saving nbytes in this case. We'll be able to load it
+            # and the tensor back up with no problems in _this_ and future
+            # versions of pytorch, but in older versions, here's the problem:
+            # the storage will be loaded up as a UntypedStorage, and then the
+            # FloatTensor will be loaded and the UntypedStorage will be assigned
+            # to it. Since the storage dtype does not match the tensor dtype, this
+            # will cause an error. If we reverse the list, like `[tensor,
+            # storage]`, then we will save the `tensor.storage()` as a faked
+            # `FloatStorage`, and the saved size will be the correct
+            # dtype-specific numel count that old versions expect. `tensor`
+            # will be able to load up properly in old versions, pointing to
+            # a FloatStorage. 
However, `storage` is still being translated to + # a UntypedStorage, and it will try to resolve to the same + # FloatStorage that `tensor` contains. This will also cause an + # error. It doesn't seem like there's any way around this. + # Probably, we just cannot maintain FC for the legacy format if the + # saved list contains both a tensor and a storage that point to the + # same data. We should still be able to maintain FC for lists of + # just tensors, as long as all views share the same dtype as the + # tensor they are viewing. + + if storage_key not in serialized_storages: + serialized_storages[storage_key] = (storage, dtype) + is_view = storage._cdata != storage._cdata + if is_view: + view_metadata = (str(storage._cdata), offset, storage.nbytes()) + else: + view_metadata = None + + res = ( + "storage", + storage_type, + storage_key, + location, + storage_numel, + view_metadata, + ) + return res + return None + + sys_info = dict( + protocol_version=PROTOCOL_VERSION, + little_endian=sys.byteorder == "little", + type_sizes=dict( + short=SHORT_SIZE, + int=INT_SIZE, + long=LONG_SIZE, + ), + ) + + pickle_module.dump(MAGIC_NUMBER, f, protocol=pickle_protocol) + pickle_module.dump(PROTOCOL_VERSION, f, protocol=pickle_protocol) + pickle_module.dump(sys_info, f, protocol=pickle_protocol) + + class PyTorchLegacyPickler(pickle_module.Pickler): + def persistent_id(self, obj): + return persistent_id(obj) + + pickler = PyTorchLegacyPickler(f, protocol=pickle_protocol) + pickler.dump(obj) + + serialized_storage_keys = sorted(serialized_storages.keys()) + pickle_module.dump(serialized_storage_keys, f, protocol=pickle_protocol) + f.flush() + for key in serialized_storage_keys: + storage, dtype = serialized_storages[key] + storage._write_file( + f, _should_read_directly(f), True, torch._utils._element_size(dtype) + ) + + +def _save( + obj, + zip_file, + pickle_module, + pickle_protocol, + _disable_byteorder_record, +): + serialized_storages = {} + id_map: Dict[int, str] = {} + 
+ # Since loading storages that view the same data with different dtypes is + # not supported, we need to keep track of the dtype associated with each + # storage data_ptr and throw an error if the dtype is ever different. + # TODO: This feature could be added in the future + storage_dtypes: Dict[int, torch.dtype] = {} + + def persistent_id(obj): + # FIXME: the docs say that persistent_id should only return a string + # but torch store returns tuples. This works only in the binary protocol + # see + # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects + # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + if isinstance(obj, torch.storage.TypedStorage): + # TODO: Once we decide to break serialization FC, this case + # can be deleted + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + storage_numel = obj._size() + + else: + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + storage_numel = storage.nbytes() + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. 
If storage is + # not allocated, don't perform this check + if str(storage.device) != "meta" and storage.data_ptr() != 0: + if storage.data_ptr() in storage_dtypes: + if storage_dtype != storage_dtypes[storage.data_ptr()]: + raise RuntimeError( + "Cannot save multiple tensors or storages that " + "view the same data as different types" + ) + else: + storage_dtypes[storage.data_ptr()] = storage_dtype + + storage_key = id_map.setdefault(storage._cdata, str(len(id_map))) + if hasattr(obj, "_fake_device") and obj._fake_device is not None: + location = str(obj._fake_device) + else: + location = location_tag(storage) + serialized_storages[storage_key] = storage + + return ("storage", storage_type, storage_key, location, storage_numel) + + return None + + # Write the pickle data for `obj` + data_buf = io.BytesIO() + + class PyTorchPickler(pickle_module.Pickler): # type: ignore[name-defined] + def persistent_id(self, obj): + return persistent_id(obj) + + pickler = PyTorchPickler(data_buf, protocol=pickle_protocol) + pickler.dump(obj) + data_value = data_buf.getvalue() + zip_file.write_record("data.pkl", data_value, len(data_value)) + + # Write byte order marker + if not _disable_byteorder_record: + if sys.byteorder not in ["little", "big"]: + raise ValueError("Unknown endianness type: " + sys.byteorder) + + zip_file.write_record("byteorder", sys.byteorder, len(sys.byteorder)) + + # Write each tensor to a file named tensor/the_tensor_key in the zip archive + for key in sorted(serialized_storages.keys()): + name = f"data/{key}" + storage = serialized_storages[key] + num_bytes = storage.nbytes() + global _serialization_tls + if _serialization_tls.skip_data: + zip_file.write_record_metadata(name, num_bytes) + else: + # given that we copy things around anyway, we might use storage.cpu() + # this means to that to get tensors serialized, you need to implement + # .cpu() on the underlying Storage + if storage.device.type != "cpu": + storage = storage.cpu() + # Now that it is on 
the CPU we can directly copy it into the zip file + zip_file.write_record(name, storage, num_bytes) + + +def load( + f: FILE_LIKE, + map_location: MAP_LOCATION = None, + pickle_module: Any = None, + *, + weights_only: Optional[bool] = None, + mmap: Optional[bool] = None, + **pickle_load_args: Any, +) -> Any: + # Reference: https://github.com/pytorch/pytorch/issues/54354 + # The first line of this docstring overrides the one Sphinx generates for the + # documentation. We need it so that Sphinx doesn't leak `pickle`s path from + # the build environment (e.g. `>> # xdoctest: +SKIP("undefined filepaths") + >>> torch.load("tensors.pt", weights_only=True) + # Load all tensors onto the CPU + >>> torch.load("tensors.pt", map_location=torch.device("cpu"), weights_only=True) + # Load all tensors onto the CPU, using a function + >>> torch.load( + ... "tensors.pt", map_location=lambda storage, loc: storage, weights_only=True + ... ) + # Load all tensors onto GPU 1 + >>> torch.load( + ... "tensors.pt", + ... map_location=lambda storage, loc: storage.cuda(1), + ... weights_only=True, + ... ) # type: ignore[attr-defined] + # Map tensors from GPU 1 to GPU 0 + >>> torch.load("tensors.pt", map_location={"cuda:1": "cuda:0"}, weights_only=True) + # Load tensor from io.BytesIO object + # Loading from a buffer setting weights_only=False, warning this can be unsafe + >>> with open("tensor.pt", "rb") as f: + ... buffer = io.BytesIO(f.read()) + >>> torch.load(buffer, weights_only=False) + # Load a module with 'ascii' encoding for unpickling + # Loading from a module setting weights_only=False, warning this can be unsafe + >>> torch.load("module.pt", encoding="ascii", weights_only=False) + """ + torch._C._log_api_usage_once("torch.load") + DOCS_MESSAGE = ( + "\n\nCheck the documentation of torch.load to learn more about types accepted by default with " + "weights_only https://pytorch.org/docs/stable/generated/torch.load.html." 
+ ) + + def _get_wo_message(message: str) -> str: + unsafe_global_pattern = r"GLOBAL (\S+) was not an allowed global by default." + has_unsafe_global = re.search(unsafe_global_pattern, message) is not None + blocklist_pattern = r"whose module (\S+) is blocked" + has_blocklist = re.search(blocklist_pattern, message) is not None + import_pattern = r"(\S+) must be (\S+) to load" + has_import = re.search(import_pattern, message) is not None + if has_unsafe_global: + updated_message = ( + "Weights only load failed. This file can still be loaded, to do so you have two options, " + "\033[1mdo those steps only if you trust the source of the checkpoint\033[0m. " + f"\n\t(1) {UNSAFE_MESSAGE}\n\t(2) Alternatively, to load with `weights_only=True` please check " + "the recommended steps in the following error message.\n\tWeightsUnpickler error: " + + message + ) + else: + if has_import: + return f"Weights only load failed. {message}\n {UNSAFE_MESSAGE}\n" + else: + updated_message = f"Weights only load failed. {UNSAFE_MESSAGE}\n" + if not has_blocklist: + updated_message += ( + "Please file an issue with the following so that we can make " + "`weights_only=True` compatible with your use case: WeightsUnpickler error: " + ) + updated_message += message + return updated_message + DOCS_MESSAGE + + global _serialization_tls + skip_data = _serialization_tls.skip_data + if skip_data: + raise RuntimeError( + "`torch.load` called within a torch.serialization.skip_data context manager " + "is not supported yet. Please call torch.load outside the skip_data context manager." 
+ ) + + weights_only_not_set = weights_only is None + + if weights_only_not_set: + weights_only = _default_to_weights_only(pickle_module) + + true_values = ["1", "y", "yes", "true"] + # Add ability to force safe only or non-safe weight loads via environment variables + force_weights_only_load = ( + os.getenv("TORCH_FORCE_WEIGHTS_ONLY_LOAD", "0") in true_values + ) + force_no_weights_only_load = ( + os.getenv("TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD", "0") in true_values + ) + + if force_weights_only_load and force_no_weights_only_load: + raise RuntimeError( + "Only one of `TORCH_FORCE_WEIGHTS_ONLY_LOAD` or `TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD` " + "should be set, but both were set." + ) + elif force_weights_only_load: + weights_only = True + elif force_no_weights_only_load: + # TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD can only override if callsite did not explicitly set weights_only + if weights_only_not_set: + warnings.warn( + "Environment variable TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD detected, since the" + "`weights_only` argument was not explicitly passed to `torch.load`, forcing weights_only=False.", + UserWarning, + stacklevel=2, + ) + weights_only = False + + if weights_only: + if pickle_module is not None: + raise RuntimeError( + "Can not safely load weights when explicit pickle_module is specified" + ) + else: + if pickle_module is None: + pickle_module = pickle + + # make flipping default BC-compatible + if mmap is None: + mmap = False + + _check_dill_version(pickle_module) + + if "encoding" not in pickle_load_args.keys(): + pickle_load_args["encoding"] = "utf-8" + + with _open_file_like(f, "rb") as opened_file: + if _is_zipfile(opened_file): + # The zipfile reader is going to advance the current file position. + # If we want to actually tail call to torch.jit.load, we need to + # reset back to the original position. 
+ orig_position = opened_file.tell() + overall_storage = None + with _open_zipfile_reader(opened_file) as opened_zipfile: + if _is_torchscript_zip(opened_zipfile): + warnings.warn( + "'torch.load' received a zip file that looks like a TorchScript archive" + " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to" + " silence this warning)", + UserWarning, + ) + if weights_only: + raise RuntimeError( + "Cannot use ``weights_only=True`` with TorchScript archives passed to " + "``torch.load``. " + UNSAFE_MESSAGE + ) + opened_file.seek(orig_position) + return torch.jit.load(opened_file, map_location=map_location) + if mmap: + if not _is_path(f): + raise ValueError( + "f must be a file path in order to use the mmap argument" + ) + size = os.path.getsize(f) + if not IS_WINDOWS: + shared = get_default_mmap_options() == MAP_SHARED + else: + shared = False + overall_storage = torch.UntypedStorage.from_file( + os.fspath(f), shared, size + ) + if weights_only: + try: + return _load( + opened_zipfile, + map_location, + _weights_only_unpickler, + overall_storage=overall_storage, + **pickle_load_args, + ) + except pickle.UnpicklingError as e: + raise pickle.UnpicklingError(_get_wo_message(str(e))) from None + return _load( + opened_zipfile, + map_location, + pickle_module, + overall_storage=overall_storage, + **pickle_load_args, + ) + if mmap: + f_name = "" if not isinstance(f, str) else f"{f}, " + raise RuntimeError( + "mmap can only be used with files saved with " + f"`torch.save({f_name}_use_new_zipfile_serialization=True), " + "please torch.save your checkpoint with this option in order to use mmap." 
+ ) + if weights_only: + try: + return _legacy_load( + opened_file, + map_location, + _weights_only_unpickler, + **pickle_load_args, + ) + except pickle.UnpicklingError as e: + raise pickle.UnpicklingError(_get_wo_message(str(e))) from None + return _legacy_load( + opened_file, map_location, pickle_module, **pickle_load_args + ) + + +# Register pickling support for layout instances such as +# torch.sparse_coo, etc +def _get_layout(name): + """Get layout extension object from its string representation.""" + cache = _get_layout.cache # type: ignore[attr-defined] + if not cache: + for v in torch.__dict__.values(): + if isinstance(v, torch.layout): + cache[str(v)] = v + return cache[name] + + +# There are yet not good way to type annotate function attributes https://github.com/python/mypy/issues/2087 +_get_layout.cache = {} # type: ignore[attr-defined] +copyreg.pickle(torch.layout, lambda obj: (_get_layout, (str(obj),))) + + +def _legacy_load(f, map_location, pickle_module, **pickle_load_args): + deserialized_objects: Dict[int, Any] = {} + + restore_location = _get_restore_location(map_location) + + class UnpicklerWrapper(pickle_module.Unpickler): # type: ignore[name-defined] + def find_class(self, mod_name, name): + if type(name) is str and "Storage" in name: + try: + return StorageType(name) + except KeyError: + pass + return super().find_class(mod_name, name) + + def _check_container_source(container_type, source_file, original_source): + try: + current_source = "".join(get_source_lines_and_file(container_type)[0]) + except Exception: # saving the source is optional, so we can ignore any errors + warnings.warn( + "Couldn't retrieve source code for container of " + "type " + container_type.__name__ + ". It won't be checked " + "for correctness upon loading." 
+ ) + return + if original_source != current_source: + if container_type.dump_patches: + file_name = container_type.__name__ + ".patch" + diff = difflib.unified_diff( + current_source.split("\n"), + original_source.split("\n"), + source_file, + source_file, + lineterm="", + ) + lines = "\n".join(diff) + try: + with open(file_name, "a+") as f: + file_size = f.seek(0, 2) + f.seek(0) + if file_size == 0: + f.write(lines) + elif file_size != len(lines) or f.read() != lines: + raise OSError + msg = ( + "Saved a reverse patch to " + file_name + ". " + "Run `patch -p0 < " + file_name + "` to revert your " + "changes." + ) + except OSError: + msg = ( + "Tried to save a patch, but couldn't create a " + "writable file " + file_name + ". Make sure it " + "doesn't exist and your working directory is " + "writable." + ) + else: + msg = ( + "you can retrieve the original source code by " + "accessing the object's source attribute or set " + "`torch.nn.Module.dump_patches = True` and use the " + "patch tool to revert the changes." + ) + msg = f"source code of class '{torch.typename(container_type)}' has changed. {msg}" + warnings.warn(msg, SourceChangeWarning) + + def legacy_load(f): + deserialized_objects: Dict[int, Any] = {} + + def persistent_load(saved_id): + if isinstance(saved_id, tuple): + # Ignore containers that don't have any sources saved + if all(saved_id[1:]): + _check_container_source(*saved_id) + return saved_id[0] + return deserialized_objects[int(saved_id)] + + with closing( + tarfile.open(fileobj=f, mode="r:", format=tarfile.PAX_FORMAT) + ) as tar, mkdtemp() as tmpdir: + if pickle_module is _weights_only_unpickler: + raise RuntimeError( + "Cannot use ``weights_only=True`` with files saved in the " + "legacy .tar format. 
" + UNSAFE_MESSAGE + ) + tar.extract("storages", path=tmpdir) + with open(os.path.join(tmpdir, "storages"), "rb", 0) as f: + num_storages = pickle_module.load(f, **pickle_load_args) + for _ in range(num_storages): + args = pickle_module.load(f, **pickle_load_args) + key, location, storage_type = args + dtype = storage_type._dtype + obj = cast(Storage, torch.UntypedStorage)._new_with_file( + f, torch._utils._element_size(dtype) + ) + obj = restore_location(obj, location) + # TODO: Once we decide to break serialization FC, we can + # stop wrapping with TypedStorage + deserialized_objects[key] = torch.storage.TypedStorage( + wrap_storage=obj, dtype=dtype, _internal=True + ) + + storage_views = pickle_module.load(f, **pickle_load_args) + for target_cdata, root_cdata, offset, numel in storage_views: + root = deserialized_objects[root_cdata] + element_size = torch._utils._element_size(root.dtype) + offset_bytes = offset * element_size + # TODO: Once we decide to break serialization FC, we can + # stop wrapping with TypedStorage + deserialized_objects[target_cdata] = torch.storage.TypedStorage( + wrap_storage=root._untyped_storage[ + offset_bytes : offset_bytes + numel * element_size + ], + dtype=root.dtype, + _internal=True, + ) + + tar.extract("tensors", path=tmpdir) + with open(os.path.join(tmpdir, "tensors"), "rb", 0) as f: + num_tensors = pickle_module.load(f, **pickle_load_args) + for _ in range(num_tensors): + args = pickle_module.load(f, **pickle_load_args) + key, storage_id, _original_tensor_type = args + storage = deserialized_objects[storage_id] + (ndim,) = struct.unpack(" str: + # When using encoding='bytes' in Py3, some **internal** keys stored as + # strings in Py2 are loaded as bytes. This function decodes them with + # ascii encoding, one that Py3 uses by default. + # + # NOTE: This should only be used on internal keys (e.g., `typename` and + # `location` in `persistent_load` below! 
+ if isinstance(bytes_str, bytes): + return bytes_str.decode("ascii") + return bytes_str + + +def _get_restore_location(map_location): + if map_location is None: + restore_location = default_restore_location + elif isinstance(map_location, dict): + + def restore_location(storage, location): + location = map_location.get(location, location) + return default_restore_location(storage, location) + + elif isinstance(map_location, (str, bytes)): + + def restore_location(storage, location): + return default_restore_location(storage, map_location) + + elif isinstance(map_location, torch.device): + + def restore_location(storage, location): + return default_restore_location(storage, str(map_location)) + + else: + + def restore_location(storage, location): + result = map_location(storage, location) + if result is None: + result = default_restore_location(storage, location) + return result + + return restore_location + + +class StorageType: + def __init__(self, name): + self._dtype = _get_dtype_from_pickle_storage_type(name) + + @property + def dtype(self): + return self._dtype + + def __str__(self): + return f"StorageType(dtype={self.dtype})" + + +def _load( + zip_file, + map_location, + pickle_module, + pickle_file="data.pkl", + overall_storage=None, + **pickle_load_args, +): + restore_location = _get_restore_location(map_location) + + loaded_storages = {} + + # check if byteswapping is needed + byteordername = "byteorder" + byteorderdata = None + if zip_file.has_record(byteordername): + byteorderdata = zip_file.get_record(byteordername) + if byteorderdata not in [b"little", b"big"]: + raise ValueError("Unknown endianness type: " + byteorderdata.decode()) + elif ( + get_default_load_endianness() == LoadEndianness.LITTLE + or get_default_load_endianness() is None + ): + byteorderdata = b"little" + elif get_default_load_endianness() == LoadEndianness.BIG: + byteorderdata = b"big" + elif get_default_load_endianness() == LoadEndianness.NATIVE: + pass + else: + raise 
ValueError("Invalid load endianness type") + + if ( + not zip_file.has_record(byteordername) + and get_default_load_endianness() is None + and sys.byteorder == "big" + ): + # Default behaviour was changed + # See https://github.com/pytorch/pytorch/issues/101688 + warnings.warn( + "The default load endianness for checkpoints without a byteorder mark " + "on big endian machines was changed from 'native' to 'little' endian, " + "to avoid this behavior please use " + "torch.serialization.set_default_load_endianness to set " + "the desired default load endianness", + UserWarning, + ) + + def load_tensor(dtype, numel, key, location): + name = f"data/{key}" + if torch._guards.detect_fake_mode(None) is not None: + nbytes = numel * torch._utils._element_size(dtype) + storage = torch.UntypedStorage(nbytes, device="meta") + elif overall_storage is not None: + storage_offset = zip_file.get_record_offset(name) + storage = overall_storage[storage_offset : storage_offset + numel] + else: + storage = ( + zip_file.get_storage_from_record(name, numel, torch.UntypedStorage) + ._typed_storage() + ._untyped_storage + ) + # swap here if byteswapping is needed + if byteorderdata is not None: + if byteorderdata.decode() != sys.byteorder: + storage.byteswap(dtype) + + # TODO: Once we decide to break serialization FC, we can + # stop wrapping with TypedStorage + typed_storage = torch.storage.TypedStorage( + wrap_storage=restore_location(storage, location), + dtype=dtype, + _internal=True, + ) + + if typed_storage._data_ptr() != 0: + loaded_storages[key] = typed_storage + + return typed_storage + + def persistent_load(saved_id): + assert isinstance(saved_id, tuple) + typename = _maybe_decode_ascii(saved_id[0]) + data = saved_id[1:] + + assert ( + typename == "storage" + ), f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'" + storage_type, key, location, numel = data + if storage_type is torch.UntypedStorage: + dtype = torch.uint8 + else: + dtype = 
storage_type.dtype + + if key in loaded_storages: + typed_storage = loaded_storages[key] + else: + nbytes = numel * torch._utils._element_size(dtype) + typed_storage = load_tensor( + dtype, nbytes, key, _maybe_decode_ascii(location) + ) + + return typed_storage + + load_module_mapping: Dict[str, str] = { + # See https://github.com/pytorch/pytorch/pull/51633 + "torch.tensor": "torch._tensor" + } + + # Need to subclass Unpickler instead of directly monkey-patching the find_class method + # because it's marked readonly in pickle. + # The type: ignore is because mypy can't statically determine the type of this class. + class UnpicklerWrapper(pickle_module.Unpickler): # type: ignore[name-defined] + # from https://stackoverflow.com/questions/13398462/unpickling-python-objects-with-a-changed-module-path/13405732 + # Lets us override the imports that pickle uses when unpickling an object. + # This is useful for maintaining BC if we change a module path that tensor instantiation relies on. + def find_class(self, mod_name, name): + if type(name) is str and "Storage" in name: + try: + return StorageType(name) + except KeyError: + pass + mod_name = load_module_mapping.get(mod_name, mod_name) + return super().find_class(mod_name, name) + + # Load the data (which may in turn use `persistent_load` to load tensors) + data_file = io.BytesIO(zip_file.get_record(pickle_file)) + + unpickler = UnpicklerWrapper(data_file, **pickle_load_args) + unpickler.persistent_load = persistent_load + # Needed for tensors where storage device and rebuild tensor device are + # not connected (wrapper subclasses and tensors rebuilt using numpy) + global _serialization_tls + _serialization_tls.map_location = map_location + result = unpickler.load() + _serialization_tls.map_location = None + + torch._utils._validate_loaded_sparse_tensors() + torch._C._log_api_usage_metadata( + "torch.load.metadata", {"serialization_id": zip_file.serialization_id()} + ) + return result + + +def 
_is_torchscript_zip(zip_file): + return "constants.pkl" in zip_file.get_all_records() From ee2f08138c8881358a1b04ec7d7b57797bb5202f Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sun, 6 Apr 2025 23:21:15 -0400 Subject: [PATCH 34/40] sglang+zero_inference --- inference/sglang/README.md | 1 + inference/sglang/local_cufile.json | 1 + inference/sglang/run_model.sh | 15 +++++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 inference/sglang/README.md create mode 100644 inference/sglang/local_cufile.json create mode 100644 inference/sglang/run_model.sh diff --git a/inference/sglang/README.md b/inference/sglang/README.md new file mode 100644 index 000000000..590d685f0 --- /dev/null +++ b/inference/sglang/README.md @@ -0,0 +1 @@ +# ZeRO-Inference SGLang examples diff --git a/inference/sglang/local_cufile.json b/inference/sglang/local_cufile.json new file mode 100644 index 000000000..29c51af0c --- /dev/null +++ b/inference/sglang/local_cufile.json @@ -0,0 +1 @@ +{"execution": {"max_io_queue_depth": 64, "max_request_parallelism": 8, "max_io_threads": 8, "parallel_io": true, "min_io_threshold_size_kb": 8192}} \ No newline at end of file diff --git a/inference/sglang/run_model.sh b/inference/sglang/run_model.sh new file mode 100644 index 000000000..3a93b09c8 --- /dev/null +++ b/inference/sglang/run_model.sh @@ -0,0 +1,15 @@ +export LOCAL_RANK=0 +DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" +MODEL_NAME="meta-llama/Meta-Llama-3.1-8B-Instruct" +BATCH_SIZE=128 + +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph + +MODEL_NAME="meta-llama/Llama-3.2-1B" +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} 
--disable-cuda-graph + + +MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph From ad81cecd0b629415ac965e196dcdb4cc328342ca Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sun, 6 Apr 2025 23:28:27 -0400 Subject: [PATCH 35/40] Remove file --- inference/sglang/local_cufile.json | 1 - 1 file changed, 1 deletion(-) delete mode 100644 inference/sglang/local_cufile.json diff --git a/inference/sglang/local_cufile.json b/inference/sglang/local_cufile.json deleted file mode 100644 index 29c51af0c..000000000 --- a/inference/sglang/local_cufile.json +++ /dev/null @@ -1 +0,0 @@ -{"execution": {"max_io_queue_depth": 64, "max_request_parallelism": 8, "max_io_threads": 8, "parallel_io": true, "min_io_threshold_size_kb": 8192}} \ No newline at end of file From dff5274e0a9371aece9163b82fd530bc71f6453f Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 8 Apr 2025 06:57:44 -0400 Subject: [PATCH 36/40] Add offload configs --- inference/sglang/ds_offload_cpu.json | 13 +++++++++++++ inference/sglang/ds_offload_nvme_aio.json | 23 +++++++++++++++++++++++ inference/sglang/ds_offload_nvme_gds.json | 23 +++++++++++++++++++++++ inference/sglang/run_model.sh | 8 +++++--- 4 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 inference/sglang/ds_offload_cpu.json create mode 100644 inference/sglang/ds_offload_nvme_aio.json create mode 100644 inference/sglang/ds_offload_nvme_gds.json diff --git a/inference/sglang/ds_offload_cpu.json b/inference/sglang/ds_offload_cpu.json new file mode 100644 index 000000000..9be11bc84 --- /dev/null +++ b/inference/sglang/ds_offload_cpu.json @@ -0,0 +1,13 @@ +{ + "zero_optimization": { + "stage": 3, + "stage3_prefetch_bucket_size": "auto", + 
"stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": "auto", + "offload_param": { + "device": "cpu", + "buffer_size": "auto" + } + }, + "train_batch_size": 1 +} diff --git a/inference/sglang/ds_offload_nvme_aio.json b/inference/sglang/ds_offload_nvme_aio.json new file mode 100644 index 000000000..268fbafc6 --- /dev/null +++ b/inference/sglang/ds_offload_nvme_aio.json @@ -0,0 +1,23 @@ +{ + "zero_optimization": { + "stage": 3, + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": "auto", + "offload_param": { + "device": "nvme", + "nvme_path": "/local_nvme/sglang", + "buffer_size": "auto", + "buffer_count": 5 + } + }, + "aio": { + "block_size": 8388608, + "queue_depth": 32, + "intra_op_parallelism": 8, + "single_submit": false, + "overlap_events": true, + "use_gds": false + }, + "train_batch_size": 1 +} diff --git a/inference/sglang/ds_offload_nvme_gds.json b/inference/sglang/ds_offload_nvme_gds.json new file mode 100644 index 000000000..479d28479 --- /dev/null +++ b/inference/sglang/ds_offload_nvme_gds.json @@ -0,0 +1,23 @@ +{ + "zero_optimization": { + "stage": 3, + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": "auto", + "offload_param": { + "device": "nvme", + "nvme_path": "/local_nvme/sglang", + "buffer_size": "auto", + "buffer_count": 3 + } + }, + "aio": { + "block_size": 8388608, + "queue_depth": 32, + "intra_op_parallelism": 8, + "single_submit": false, + "overlap_events": true, + "use_gds": true + }, + "train_batch_size": 1 +} diff --git a/inference/sglang/run_model.sh b/inference/sglang/run_model.sh index 3a93b09c8..29b3aad18 100644 --- a/inference/sglang/run_model.sh +++ b/inference/sglang/run_model.sh @@ -1,13 +1,15 @@ export LOCAL_RANK=0 DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" MODEL_NAME="meta-llama/Meta-Llama-3.1-8B-Instruct" 
-BATCH_SIZE=128 +BATCH_SIZE=1 # python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph MODEL_NAME="meta-llama/Llama-3.2-1B" -python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference -# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" From d84bb56869b0997a32b2138ef0f94ba0539a09c7 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 8 Apr 2025 07:10:43 -0400 Subject: [PATCH 37/40] Add pin_memory --- inference/sglang/ds_offload_cpu.json | 1 + inference/sglang/ds_offload_nvme_aio.json | 1 + inference/sglang/ds_offload_nvme_gds.json | 1 + 3 files changed, 3 insertions(+) diff --git a/inference/sglang/ds_offload_cpu.json b/inference/sglang/ds_offload_cpu.json index 9be11bc84..1c0438014 100644 --- a/inference/sglang/ds_offload_cpu.json +++ b/inference/sglang/ds_offload_cpu.json @@ -6,6 +6,7 @@ "stage3_max_live_parameters": "auto", "offload_param": { "device": "cpu", + "pin_memory": true, "buffer_size": "auto" } }, diff --git a/inference/sglang/ds_offload_nvme_aio.json 
b/inference/sglang/ds_offload_nvme_aio.json index 268fbafc6..71ea89438 100644 --- a/inference/sglang/ds_offload_nvme_aio.json +++ b/inference/sglang/ds_offload_nvme_aio.json @@ -7,6 +7,7 @@ "offload_param": { "device": "nvme", "nvme_path": "/local_nvme/sglang", + "pin_memory": true, "buffer_size": "auto", "buffer_count": 5 } diff --git a/inference/sglang/ds_offload_nvme_gds.json b/inference/sglang/ds_offload_nvme_gds.json index 479d28479..7f3784741 100644 --- a/inference/sglang/ds_offload_nvme_gds.json +++ b/inference/sglang/ds_offload_nvme_gds.json @@ -7,6 +7,7 @@ "offload_param": { "device": "nvme", "nvme_path": "/local_nvme/sglang", + "pin_memory": true, "buffer_size": "auto", "buffer_count": 3 } From db3b32b4df9b85d970b12c8a96f9f89d248d1c25 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 8 Apr 2025 07:22:42 -0400 Subject: [PATCH 38/40] Cleanup scripts --- inference/sglang/run_llama3_1B.sh | 11 +++++++++++ inference/sglang/run_llama3_70B.sh | 9 +++++++++ inference/sglang/{run_model.sh => run_llama3_8B.sh} | 10 +--------- 3 files changed, 21 insertions(+), 9 deletions(-) create mode 100644 inference/sglang/run_llama3_1B.sh create mode 100644 inference/sglang/run_llama3_70B.sh rename inference/sglang/{run_model.sh => run_llama3_8B.sh} (62%) diff --git a/inference/sglang/run_llama3_1B.sh b/inference/sglang/run_llama3_1B.sh new file mode 100644 index 000000000..a6a1f543f --- /dev/null +++ b/inference/sglang/run_llama3_1B.sh @@ -0,0 +1,11 @@ +export LOCAL_RANK=0 +DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" +BATCH_SIZE=128 +MODEL_NAME="meta-llama/Llama-3.2-1B" + +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config 
ds_offload_nvme_aio.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph + + diff --git a/inference/sglang/run_llama3_70B.sh b/inference/sglang/run_llama3_70B.sh new file mode 100644 index 000000000..6e3949551 --- /dev/null +++ b/inference/sglang/run_llama3_70B.sh @@ -0,0 +1,9 @@ +export LOCAL_RANK=0 +DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" +BATCH_SIZE=128 +MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" + +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph diff --git a/inference/sglang/run_model.sh b/inference/sglang/run_llama3_8B.sh similarity index 62% rename from inference/sglang/run_model.sh rename to inference/sglang/run_llama3_8B.sh index 29b3aad18..f203bdd4a 100644 --- a/inference/sglang/run_model.sh +++ b/inference/sglang/run_llama3_8B.sh @@ -1,17 +1,9 @@ export LOCAL_RANK=0 DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" +BATCH_SIZE=128 MODEL_NAME="meta-llama/Meta-Llama-3.1-8B-Instruct" -BATCH_SIZE=1 -# python -m sglang.bench_offline_throughput 
--model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph - -MODEL_NAME="meta-llama/Llama-3.2-1B" python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph - - -MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" -# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference -# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph From 6ee91cb5297497ee1abdd281f58ba2036ad52faf Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 12 Apr 2025 14:17:38 -0400 Subject: [PATCH 39/40] SGLang README --- inference/sglang/README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/inference/sglang/README.md b/inference/sglang/README.md index 590d685f0..f904864dd 100644 --- a/inference/sglang/README.md +++ b/inference/sglang/README.md @@ -1 +1,12 @@ -# ZeRO-Inference SGLang examples +# SGLang + ZeRO-Inference Examples +This folder contains examples of [ZeRO-Inference](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) integration into [SGLang](https://github.com/sgl-project/sglang) framework. 
This integration enables SGLang to run inference on massive models (e.g., with hundreds of billions of parameters) on a single GPU through the NVMe/CPU offloading optimizations of ZeRO-Inference. + +## Prerequisites +1. DeepSpeed version >= [0.16.6](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.16.6) +2. SGLang: These examples require our SGLang [fork](https://github.com/tjruwase/sglang/tree/zero-inference). We plan to upstream the SGLang changes to the main branch. + + +## Examples +The examples comprise the following: +1. bash scripts that benchmark SGLang throughput in [offline mode](https://github.com/sgl-project/sglang/blob/main/python/sglang/bench_offline_throughput.py) with different ZeRO-Inference offloading options. Each script runs inference on a different model with a prompt of 512 tokens, output of 32 tokens, and batch size of 128. +2. DeepSpeed config files corresponding to ZeRO-Inference offloading: (i) CPU offload, (ii) NVMe offload with AIO, and (iii) NVMe offload with NVIDIA GDS. \ No newline at end of file From e283b7429f407f0d74ee50aed2ac122a82cf93db Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 12 Apr 2025 14:20:06 -0400 Subject: [PATCH 40/40] Remove file --- deepnvme/model_checkpoint/local_cufile.json | 1 - 1 file changed, 1 deletion(-) delete mode 100644 deepnvme/model_checkpoint/local_cufile.json diff --git a/deepnvme/model_checkpoint/local_cufile.json b/deepnvme/model_checkpoint/local_cufile.json deleted file mode 100644 index 7d4d9c8e3..000000000 --- a/deepnvme/model_checkpoint/local_cufile.json +++ /dev/null @@ -1 +0,0 @@ -{"execution": {"max_io_queue_depth": 8, "max_request_parallelism": 1, "max_io_threads": 1, "parallel_io": true, "min_io_threshold_size_kb": 8192}} \ No newline at end of file