From 8106bb8383ba9c635078c91983c27e554f246d6d Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Thu, 30 Dec 2021 10:35:27 -0800 Subject: [PATCH 01/40] Fast model checkpointing --- .../torch_serialize_save_model.py | 128 ++++++++++++++++++ .../torch_serialize_save_tensor.py | 113 ++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 fast_io/model_checkpoint/torch_serialize_save_model.py create mode 100644 fast_io/model_checkpoint/torch_serialize_save_tensor.py diff --git a/fast_io/model_checkpoint/torch_serialize_save_model.py b/fast_io/model_checkpoint/torch_serialize_save_model.py new file mode 100644 index 000000000..03d924b1b --- /dev/null +++ b/fast_io/model_checkpoint/torch_serialize_save_model.py @@ -0,0 +1,128 @@ +import time +import argparse +import torch +import os +from transformers import AutoModelForCausalLM +from transformers import T5ForConditionalGeneration +import deepspeed +from deepspeed.ops.aio import AsyncIOBuilder + + +AIO_QUEUE_DEPTH = 8 +AIO_BLOCK_SIZE = 8*(1024**2) +AIO_THREAD_COUNT = 1 +AIO_SINGLE_SUBMIT = False +AIO_OVERLAP_EVENTS = False +PINNED_BUFFER_MB = 64 + +def _get_model(big_model): + if big_model: + model_name="EleutherAI/gpt-j-6B" + model = AutoModelForCausalLM.from_pretrained(model_name).half()#.cuda() + ckpt_name="gpt-j-6B" + else: + model_name="hf-internal-testing/tiny-random-t5" # "patrickvonplaten/t5-tiny-random" # "t5-small" + model = T5ForConditionalGeneration.from_pretrained(model_name).half() + ckpt_name="t5-small" + + return model, model_name, ckpt_name + + +def _get_aio_handle(): + h = AsyncIOBuilder().load().aio_handle( + block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + num_threads=AIO_THREAD_COUNT) + return h + +def test_save(file, buffer, use_zipfile): + st = time.time() + torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + return time.time() - st + + +def test_ds_mock_save(file, buffer, 
use_zipfile): + from deepspeed.io import MockFileWriter + st = time.time() + dsmw = MockFileWriter(file) + torch.save(f=dsmw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + write_sec = time.time() - st + dsmw._dump_state() + return write_sec + +def test_ds_py_save(file, buffer, use_zipfile): + from deepspeed.io import PyFileWriter + st = time.time() + dspw = PyFileWriter(file) + torch.save(f=dspw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + write_sec = time.time() - st + dspw._dump_state() + return write_sec + +def test_ds_aio_save(file, buffer, use_zipfile): + h = _get_aio_handle() + pinned_memory = torch.zeros(PINNED_BUFFER_MB*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + from deepspeed.io import DeepSpeedFileWriter as dsfw + st = time.time() + dsfw = dsfw( + file_path=file, + aio_handle=h, + pinned_tensor=pinned_memory) + torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=True) + write_sec = time.time() - st + dsfw._dump_state() + return write_sec + +def run(model, model_name, ckpt_name, folder): + print(f'Model name = {model_name}') + fn_dict = { + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_aio_save':test_ds_aio_save + } + for tag, fn in fn_dict.items(): + file = os.path.join(folder, f'{tag}_{ckpt_name}.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = fn(file, model, True) + ckpt_size = os.path.getsize(file) + gb_size = ckpt_size/(1024**3) + gb_per_sec = gb_size/write_sec + print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') + print(f'*********************************************') + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + parser.add_argument('--big_model', + action='store_true', + help='Use 
EleutherAI/gpt-j-6B for checkpointing.') + args = parser.parse_args() + print(f'args = {args}') + return args + + +def main(): + print(f'Performance test of deepspeed fast model checkpoint') + print(f'torch version = {torch.__version__}') + torch.manual_seed(42) + args = parse_arguments() + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + quit() + model, model_name, ckpt_name = _get_model(args.big_model) + + run(model, model_name, ckpt_name, args.folder) + + +if __name__ == "__main__": + main() diff --git a/fast_io/model_checkpoint/torch_serialize_save_tensor.py b/fast_io/model_checkpoint/torch_serialize_save_tensor.py new file mode 100644 index 000000000..85f99eddf --- /dev/null +++ b/fast_io/model_checkpoint/torch_serialize_save_tensor.py @@ -0,0 +1,113 @@ +import time +import argparse +import torch +import os +import deepspeed +from deepspeed.ops.aio import AsyncIOBuilder + + +AIO_QUEUE_DEPTH = 8 +AIO_BLOCK_SIZE = 8*(1024**2) +AIO_THREAD_COUNT = 1 +AIO_SINGLE_SUBMIT = False +AIO_OVERLAP_EVENTS = False +PINNED_BUFFER_MB = 64 + + +def _get_aio_handle(): + h = AsyncIOBuilder().load().aio_handle( + block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + num_threads=AIO_THREAD_COUNT) + return h + +def test_save(file, buffer, use_zipfile): + st = time.time() + torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + return time.time() - st + + +def test_ds_mock_save(file, buffer, use_zipfile): + from deepspeed.io import MockFileWriter + st = time.time() + dsmw = MockFileWriter(file) + torch.save(f=dsmw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + write_sec = time.time() - st + dsmw._dump_state() + return write_sec + +def test_ds_py_save(file, buffer, use_zipfile): + from deepspeed.io import PyFileWriter + st = time.time() + dspw = PyFileWriter(file) + torch.save(f=dspw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + 
write_sec = time.time() - st + dspw._dump_state() + return write_sec + +def test_ds_aio_save(file, buffer, use_zipfile): + h = _get_aio_handle() + pinned_memory = torch.zeros(PINNED_BUFFER_MB*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + from deepspeed.io import DeepSpeedFileWriter as dsfw + st = time.time() + dsfw = dsfw( + file_path=file, + aio_handle=h, + pinned_tensor=pinned_memory) + torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=True) + write_sec = time.time() - st + dsfw._dump_state() + return write_sec + +def run(mb_size, folder): + buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu').pin_memory() + + fn_dict = { + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_aio_save':test_ds_aio_save + } + for tag, fn in fn_dict.items(): + file = os.path.join(folder, f'{tag}_{mb_size}MB.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = fn(file, buffer, True) + gb_per_sec = mb_size/(1024.0*write_sec) + gb_size = os.path.getsize(file)/(1024**3) + print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') + print(f'*********************************************') + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + parser.add_argument('--mb_size', + type=int, + default=None, + required=True, + help='Size of tensor to save in MB.') + args = parser.parse_args() + print(f'args = {args}') + return args + + + +def main(): + print(f'Performance test of deepspeed fast checkpoint') + args = parse_arguments() + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + quit() + run(args.mb_size, args.folder) + + +if __name__ == "__main__": + main() From 761e4e54d6bb317d0068e66347d70bffaa7f01cf Mon Sep 17 00:00:00 
2001 From: Tunji Ruwase Date: Fri, 31 Dec 2021 10:42:54 -0800 Subject: [PATCH 02/40] Support both legacy and serialized formats --- ...h_serialize_save_model.py => torch_save_model.py} | 12 ++++++++---- ...serialize_save_tensor.py => torch_save_tensor.py} | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) rename fast_io/model_checkpoint/{torch_serialize_save_model.py => torch_save_model.py} (92%) rename fast_io/model_checkpoint/{torch_serialize_save_tensor.py => torch_save_tensor.py} (92%) diff --git a/fast_io/model_checkpoint/torch_serialize_save_model.py b/fast_io/model_checkpoint/torch_save_model.py similarity index 92% rename from fast_io/model_checkpoint/torch_serialize_save_model.py rename to fast_io/model_checkpoint/torch_save_model.py index 03d924b1b..0b0a5d4d2 100644 --- a/fast_io/model_checkpoint/torch_serialize_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -70,12 +70,12 @@ def test_ds_aio_save(file, buffer, use_zipfile): file_path=file, aio_handle=h, pinned_tensor=pinned_memory) - torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=True) + torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) write_sec = time.time() - st dsfw._dump_state() return write_sec -def run(model, model_name, ckpt_name, folder): +def run(model, model_name, ckpt_name, folder, legacy_save): print(f'Model name = {model_name}') fn_dict = { 'test_save': test_save, @@ -89,7 +89,7 @@ def run(model, model_name, ckpt_name, folder): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, model, True) + write_sec = fn(file, model, not legacy_save) ckpt_size = os.path.getsize(file) gb_size = ckpt_size/(1024**3) gb_per_sec = gb_size/write_sec @@ -106,6 +106,10 @@ def parse_arguments(): parser.add_argument('--big_model', action='store_true', help='Use EleutherAI/gpt-j-6B for checkpointing.') + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + args = 
parser.parse_args() print(f'args = {args}') return args @@ -121,7 +125,7 @@ def main(): quit() model, model_name, ckpt_name = _get_model(args.big_model) - run(model, model_name, ckpt_name, args.folder) + run(model, model_name, ckpt_name, args.folder, args.legacy) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_serialize_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py similarity index 92% rename from fast_io/model_checkpoint/torch_serialize_save_tensor.py rename to fast_io/model_checkpoint/torch_save_tensor.py index 85f99eddf..cc601bb3e 100644 --- a/fast_io/model_checkpoint/torch_serialize_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -56,12 +56,12 @@ def test_ds_aio_save(file, buffer, use_zipfile): file_path=file, aio_handle=h, pinned_tensor=pinned_memory) - torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=True) + torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) write_sec = time.time() - st dsfw._dump_state() return write_sec -def run(mb_size, folder): +def run(mb_size, folder, legacy_save): buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu').pin_memory() fn_dict = { @@ -76,7 +76,7 @@ def run(mb_size, folder): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, buffer, True) + write_sec = fn(file, buffer, not legacy_save) gb_per_sec = mb_size/(1024.0*write_sec) gb_size = os.path.getsize(file)/(1024**3) print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') @@ -94,6 +94,10 @@ def parse_arguments(): default=None, required=True, help='Size of tensor to save in MB.') + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + args = parser.parse_args() print(f'args = {args}') return args @@ -106,7 +110,7 @@ def main(): if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') quit() - run(args.mb_size, args.folder) + 
run(args.mb_size, args.folder, args.legacy) if __name__ == "__main__": From 5967c7986e9e9ccb05e75d3fdb53110137755933 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 3 Jan 2022 12:19:59 +0000 Subject: [PATCH 03/40] Add io_buffer_mb option --- fast_io/model_checkpoint/torch_save_model.py | 22 +++++++++++------- fast_io/model_checkpoint/torch_save_tensor.py | 23 +++++++++++-------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index 0b0a5d4d2..150a0139c 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -37,13 +37,13 @@ def _get_aio_handle(): num_threads=AIO_THREAD_COUNT) return h -def test_save(file, buffer, use_zipfile): +def test_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) return time.time() - st -def test_ds_mock_save(file, buffer, use_zipfile): +def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): from deepspeed.io import MockFileWriter st = time.time() dsmw = MockFileWriter(file) @@ -52,7 +52,7 @@ def test_ds_mock_save(file, buffer, use_zipfile): dsmw._dump_state() return write_sec -def test_ds_py_save(file, buffer, use_zipfile): +def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): from deepspeed.io import PyFileWriter st = time.time() dspw = PyFileWriter(file) @@ -61,9 +61,9 @@ def test_ds_py_save(file, buffer, use_zipfile): dspw._dump_state() return write_sec -def test_ds_aio_save(file, buffer, use_zipfile): +def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): h = _get_aio_handle() - pinned_memory = torch.zeros(PINNED_BUFFER_MB*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() from deepspeed.io import DeepSpeedFileWriter as dsfw st = time.time() dsfw = 
dsfw( @@ -75,7 +75,7 @@ def test_ds_aio_save(file, buffer, use_zipfile): dsfw._dump_state() return write_sec -def run(model, model_name, ckpt_name, folder, legacy_save): +def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): print(f'Model name = {model_name}') fn_dict = { 'test_save': test_save, @@ -89,7 +89,7 @@ def run(model, model_name, ckpt_name, folder, legacy_save): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, model, not legacy_save) + write_sec = fn(file, model, not legacy_save, io_buffer_mb) ckpt_size = os.path.getsize(file) gb_size = ckpt_size/(1024**3) gb_per_sec = gb_size/write_sec @@ -109,6 +109,12 @@ def parse_arguments(): parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') + + parser.add_argument('--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + required=True, + help='Size of pinned i/o buffer in MB.') args = parser.parse_args() print(f'args = {args}') @@ -125,7 +131,7 @@ def main(): quit() model, model_name, ckpt_name = _get_model(args.big_model) - run(model, model_name, ckpt_name, args.folder, args.legacy) + run(model, model_name, ckpt_name, args.folder, args.legacy, args.io_buffer_mb) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index cc601bb3e..80ac99197 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -13,7 +13,6 @@ AIO_OVERLAP_EVENTS = False PINNED_BUFFER_MB = 64 - def _get_aio_handle(): h = AsyncIOBuilder().load().aio_handle( block_size=AIO_BLOCK_SIZE, @@ -23,7 +22,7 @@ def _get_aio_handle(): num_threads=AIO_THREAD_COUNT) return h -def test_save(file, buffer, use_zipfile): +def test_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) return time.time() - st @@ -38,7 +37,7 @@ def test_ds_mock_save(file, 
buffer, use_zipfile): dsmw._dump_state() return write_sec -def test_ds_py_save(file, buffer, use_zipfile): +def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): from deepspeed.io import PyFileWriter st = time.time() dspw = PyFileWriter(file) @@ -47,9 +46,9 @@ def test_ds_py_save(file, buffer, use_zipfile): dspw._dump_state() return write_sec -def test_ds_aio_save(file, buffer, use_zipfile): +def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): h = _get_aio_handle() - pinned_memory = torch.zeros(PINNED_BUFFER_MB*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() from deepspeed.io import DeepSpeedFileWriter as dsfw st = time.time() dsfw = dsfw( @@ -61,8 +60,8 @@ def test_ds_aio_save(file, buffer, use_zipfile): dsfw._dump_state() return write_sec -def run(mb_size, folder, legacy_save): - buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu').pin_memory() +def run(mb_size, folder, legacy_save, io_buffer_mb): + buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') # .pin_memory() fn_dict = { 'test_save': test_save, @@ -76,7 +75,7 @@ def run(mb_size, folder, legacy_save): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, buffer, not legacy_save) + write_sec = fn(file, buffer, not legacy_save, io_buffer_mb) gb_per_sec = mb_size/(1024.0*write_sec) gb_size = os.path.getsize(file)/(1024**3) print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') @@ -98,6 +97,12 @@ def parse_arguments(): action='store_true', help='Use torch legacy save format') + parser.add_argument('--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + required=True, + help='Size of pinned i/o buffer in MB.') + args = parser.parse_args() print(f'args = {args}') return args @@ -110,7 +115,7 @@ def main(): if not os.path.exists(args.folder): 
print(f'Invalid folder: {args.folder}') quit() - run(args.mb_size, args.folder, args.legacy) + run(args.mb_size, args.folder, args.legacy, args.io_buffer_mb) if __name__ == "__main__": From d96f1f6136603a492d700b043870f82518f2ed7e Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 3 Jan 2022 12:43:54 +0000 Subject: [PATCH 04/40] Bug fix --- fast_io/model_checkpoint/torch_save_tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 80ac99197..312cf31b8 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -28,7 +28,7 @@ def test_save(file, buffer, use_zipfile, io_buffer_mb): return time.time() - st -def test_ds_mock_save(file, buffer, use_zipfile): +def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): from deepspeed.io import MockFileWriter st = time.time() dsmw = MockFileWriter(file) From bbd96f2fa31157a52777a5c3fc078f60580e5513 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 3 Jan 2022 13:18:34 +0000 Subject: [PATCH 05/40] Force flush --- fast_io/model_checkpoint/torch_save_model.py | 1 + fast_io/model_checkpoint/torch_save_tensor.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index 150a0139c..a14a933a9 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -71,6 +71,7 @@ def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): aio_handle=h, pinned_tensor=pinned_memory) torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + dsfw.close() # Force flush to storage write_sec = time.time() - st dsfw._dump_state() return write_sec diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 312cf31b8..20a9ea75b 
100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -56,12 +56,13 @@ def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): aio_handle=h, pinned_tensor=pinned_memory) torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + dsfw.close() # Force flush to storage write_sec = time.time() - st dsfw._dump_state() return write_sec def run(mb_size, folder, legacy_save, io_buffer_mb): - buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') # .pin_memory() + buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') fn_dict = { 'test_save': test_save, From 3a161270475044d1d0ba5af10e7b4381063288ff Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 4 Jan 2022 23:04:49 +0000 Subject: [PATCH 06/40] More model options; Refactor common codes --- fast_io/model_checkpoint/torch_save_model.py | 170 ++++++++++-------- fast_io/model_checkpoint/torch_save_tensor.py | 65 +------ fast_io/model_checkpoint/torch_save_utils.py | 61 +++++++ 3 files changed, 157 insertions(+), 139 deletions(-) create mode 100644 fast_io/model_checkpoint/torch_save_utils.py diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index a14a933a9..04795fa4d 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -1,80 +1,44 @@ import time import argparse import torch +from torch.optim import Adam import os from transformers import AutoModelForCausalLM from transformers import T5ForConditionalGeneration -import deepspeed -from deepspeed.ops.aio import AsyncIOBuilder - - -AIO_QUEUE_DEPTH = 8 -AIO_BLOCK_SIZE = 8*(1024**2) -AIO_THREAD_COUNT = 1 -AIO_SINGLE_SUBMIT = False -AIO_OVERLAP_EVENTS = False -PINNED_BUFFER_MB = 64 - -def _get_model(big_model): - if big_model: - model_name="EleutherAI/gpt-j-6B" - model = 
AutoModelForCausalLM.from_pretrained(model_name).half()#.cuda() - ckpt_name="gpt-j-6B" - else: - model_name="hf-internal-testing/tiny-random-t5" # "patrickvonplaten/t5-tiny-random" # "t5-small" - model = T5ForConditionalGeneration.from_pretrained(model_name).half() - ckpt_name="t5-small" - +from torch_save_utils import PINNED_BUFFER_MB +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save + + +def _get_gpt_j_6B(tag): + model_name="EleutherAI/gpt-j-6B" + model = AutoModelForCausalLM.from_pretrained(model_name)#.half() + ckpt_name="gpt-j-6B" return model, model_name, ckpt_name +def _get_tiny_t5(tag): + model_name="hf-internal-testing/tiny-random-t5" + model = T5ForConditionalGeneration.from_pretrained(model_name)#.half() + ckpt_name="tiny-random-t5" + return model, model_name, ckpt_name + + +def _get_hf_gpt2(tag): + model_name = tag + model = AutoModelForCausalLM.from_pretrained(tag) + ckpt_name = tag + return model, model_name, ckpt_name + + +HF_MODELS = { + 'tiny-random-t5': _get_tiny_t5, + 'gpt-j-6B': _get_gpt_j_6B, + 'gpt2': _get_hf_gpt2, + 'gpt2-large': _get_hf_gpt2, + 'gpt2-xl': _get_hf_gpt2, +} -def _get_aio_handle(): - h = AsyncIOBuilder().load().aio_handle( - block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - num_threads=AIO_THREAD_COUNT) - return h - -def test_save(file, buffer, use_zipfile, io_buffer_mb): - st = time.time() - torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - return time.time() - st - - -def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): - from deepspeed.io import MockFileWriter - st = time.time() - dsmw = MockFileWriter(file) - torch.save(f=dsmw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - write_sec = time.time() - st - dsmw._dump_state() - return write_sec - -def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): - from deepspeed.io import PyFileWriter - st = 
time.time() - dspw = PyFileWriter(file) - torch.save(f=dspw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - write_sec = time.time() - st - dspw._dump_state() - return write_sec - -def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): - h = _get_aio_handle() - pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() - from deepspeed.io import DeepSpeedFileWriter as dsfw - st = time.time() - dsfw = dsfw( - file_path=file, - aio_handle=h, - pinned_tensor=pinned_memory) - torch.save(f=dsfw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - dsfw.close() # Force flush to storage - write_sec = time.time() - st - dsfw._dump_state() - return write_sec +def _get_model(model_tag): + return HF_MODELS[model_tag](model_tag) def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): print(f'Model name = {model_name}') @@ -82,7 +46,7 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): 'test_save': test_save, 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - 'test_ds_aio_save':test_ds_aio_save + 'test_ds_fast_save':test_ds_fast_save } for tag, fn in fn_dict.items(): file = os.path.join(folder, f'{tag}_{ckpt_name}.pt') @@ -97,6 +61,20 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') print(f'*********************************************') + +def _get_initialized_optimizer(model, fused_opt): + base_optimizer = Adam(model.parameters()) + import deepspeed + if fused_opt: + from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper + else: + from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper + optimizer = FP16_Wrapper(base_optimizer) + for p in model.parameters(): + p.grad = torch.zeros_like(p) + optimizer.step() + return optimizer + def parse_arguments(): parser = 
argparse.ArgumentParser() parser.add_argument('--folder', @@ -104,17 +82,29 @@ def parse_arguments(): type=str, required=True, help='Folder to use for I/O.') - parser.add_argument('--big_model', - action='store_true', - help='Use EleutherAI/gpt-j-6B for checkpointing.') + + parser.add_argument('--model', + default=None, + type=str, + required=True, + help='Hugging Face transformers tag of model (e.g., gpt2).') + parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') + parser.add_argument('--optimizer', + action='store_true', + help='Include optimizer state in checkpoint.') + + + parser.add_argument('--fused', + action='store_true', + help='Use fused fp16 optimizer.') + parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, - required=True, help='Size of pinned i/o buffer in MB.') args = parser.parse_args() @@ -122,17 +112,41 @@ def parse_arguments(): return args +def validate_arguments(args): + success = True + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + success = False + + if not args.model in HF_MODELS: + print(f'{args.model} is not a supported HF model tag') + success = False + + return success + def main(): print(f'Performance test of deepspeed fast model checkpoint') print(f'torch version = {torch.__version__}') torch.manual_seed(42) + args = parse_arguments() - if not os.path.exists(args.folder): - print(f'Invalid folder: {args.folder}') + if not validate_arguments(args): quit() - model, model_name, ckpt_name = _get_model(args.big_model) - - run(model, model_name, ckpt_name, args.folder, args.legacy, args.io_buffer_mb) + + model, model_name, ckpt_name = _get_model(args.model) + if args.optimizer: + model = model.half().cuda() + optimizer = _get_initialized_optimizer(model, args.fused) + ckpt_state = {'model': model, 'optimizer': optimizer} + else: + ckpt_state = {'model': model} + run( + ckpt_state, + model_name, + ckpt_name, + args.folder, + args.legacy, + 
args.io_buffer_mb) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 20a9ea75b..8edea0612 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -2,64 +2,8 @@ import argparse import torch import os -import deepspeed -from deepspeed.ops.aio import AsyncIOBuilder - - -AIO_QUEUE_DEPTH = 8 -AIO_BLOCK_SIZE = 8*(1024**2) -AIO_THREAD_COUNT = 1 -AIO_SINGLE_SUBMIT = False -AIO_OVERLAP_EVENTS = False -PINNED_BUFFER_MB = 64 - -def _get_aio_handle(): - h = AsyncIOBuilder().load().aio_handle( - block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - num_threads=AIO_THREAD_COUNT) - return h - -def test_save(file, buffer, use_zipfile, io_buffer_mb): - st = time.time() - torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - return time.time() - st - - -def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): - from deepspeed.io import MockFileWriter - st = time.time() - dsmw = MockFileWriter(file) - torch.save(f=dsmw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - write_sec = time.time() - st - dsmw._dump_state() - return write_sec - -def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): - from deepspeed.io import PyFileWriter - st = time.time() - dspw = PyFileWriter(file) - torch.save(f=dspw, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - write_sec = time.time() - st - dspw._dump_state() - return write_sec - -def test_ds_aio_save(file, buffer, use_zipfile, io_buffer_mb): - h = _get_aio_handle() - pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() - from deepspeed.io import DeepSpeedFileWriter as dsfw - st = time.time() - dsfw = dsfw( - file_path=file, - aio_handle=h, - pinned_tensor=pinned_memory) - torch.save(f=dsfw, obj=buffer, 
_use_new_zipfile_serialization=use_zipfile) - dsfw.close() # Force flush to storage - write_sec = time.time() - st - dsfw._dump_state() - return write_sec +from torch_save_utils import PINNED_BUFFER_MB +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save def run(mb_size, folder, legacy_save, io_buffer_mb): buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') @@ -68,7 +12,7 @@ def run(mb_size, folder, legacy_save, io_buffer_mb): 'test_save': test_save, 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - 'test_ds_aio_save':test_ds_aio_save + 'test_ds_fast_save':test_ds_fast_save } for tag, fn in fn_dict.items(): file = os.path.join(folder, f'{tag}_{mb_size}MB.pt') @@ -101,7 +45,6 @@ def parse_arguments(): parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, - required=True, help='Size of pinned i/o buffer in MB.') args = parser.parse_args() @@ -111,7 +54,7 @@ def parse_arguments(): def main(): - print(f'Performance test of deepspeed fast checkpoint') + print(f'Performance test of deepspeed fast tensor checkpoint') args = parse_arguments() if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py new file mode 100644 index 000000000..f75499f0c --- /dev/null +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -0,0 +1,61 @@ +import time +import torch +import os +import deepspeed +from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter + + +AIO_QUEUE_DEPTH = 8 +AIO_BLOCK_SIZE = 8*(1024**2) +AIO_THREAD_COUNT = 1 +AIO_SINGLE_SUBMIT = False +AIO_OVERLAP_EVENTS = False +PINNED_BUFFER_MB = 64 + +def _get_aio_handle(): + h = AsyncIOBuilder().load().aio_handle( + block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + 
overlap_events=AIO_SINGLE_SUBMIT, + num_threads=AIO_THREAD_COUNT) + return h + +def test_save(file, buffer, use_zipfile, io_buffer_mb): + st = time.time() + torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + return time.time() - st + + +def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): + st = time.time() + ds_mock_writer = MockFileWriter(file) + torch.save(f=ds_mock_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + ds_mock_writer.close() # Force flush to storage + write_sec = time.time() - st + ds_mock_writer._dump_state() + return write_sec + +def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): + st = time.time() + ds_py_writer = PyFileWriter(file) + torch.save(f=ds_py_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + ds_py_writer.close() # Force flush to storage + write_sec = time.time() - st + ds_py_writer._dump_state() + return write_sec + +def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb): + h = _get_aio_handle() + pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + st = time.time() + ds_fast_writer = FastFileWriter( + file_path=file, + aio_handle=h, + pinned_tensor=pinned_memory) + torch.save(f=ds_fast_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + ds_fast_writer.close() # Force flush to storage + write_sec = time.time() - st + ds_fast_writer._dump_state() + return write_sec \ No newline at end of file From c3df4955ba214deb95713f7e3f44b0675374a60c Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Tue, 4 Jan 2022 18:27:30 -0800 Subject: [PATCH 07/40] --gpu option --- fast_io/model_checkpoint/torch_save_model.py | 77 ++++++++++--------- fast_io/model_checkpoint/torch_save_tensor.py | 32 +++++--- fast_io/model_checkpoint/torch_save_utils.py | 57 ++++++++------ 3 files changed, 96 insertions(+), 70 deletions(-) diff --git a/fast_io/model_checkpoint/torch_save_model.py 
b/fast_io/model_checkpoint/torch_save_model.py index 04795fa4d..d38f5af67 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -10,15 +10,16 @@ def _get_gpt_j_6B(tag): - model_name="EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name)#.half() - ckpt_name="gpt-j-6B" + model_name = "EleutherAI/gpt-j-6B" + model = AutoModelForCausalLM.from_pretrained(model_name) #.half() + ckpt_name = "gpt-j-6B" return model, model_name, ckpt_name + def _get_tiny_t5(tag): - model_name="hf-internal-testing/tiny-random-t5" - model = T5ForConditionalGeneration.from_pretrained(model_name)#.half() - ckpt_name="tiny-random-t5" + model_name = "hf-internal-testing/tiny-random-t5" + model = T5ForConditionalGeneration.from_pretrained(model_name) #.half() + ckpt_name = "tiny-random-t5" return model, model_name, ckpt_name @@ -26,27 +27,29 @@ def _get_hf_gpt2(tag): model_name = tag model = AutoModelForCausalLM.from_pretrained(tag) ckpt_name = tag - return model, model_name, ckpt_name + return model, model_name, ckpt_name HF_MODELS = { 'tiny-random-t5': _get_tiny_t5, 'gpt-j-6B': _get_gpt_j_6B, 'gpt2': _get_hf_gpt2, - 'gpt2-large': _get_hf_gpt2, - 'gpt2-xl': _get_hf_gpt2, + 'gpt2-large': _get_hf_gpt2, + 'gpt2-xl': _get_hf_gpt2, } + def _get_model(model_tag): return HF_MODELS[model_tag](model_tag) + def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): print(f'Model name = {model_name}') fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - 'test_ds_fast_save':test_ds_fast_save + 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): file = os.path.join(folder, f'{tag}_{ckpt_name}.pt') @@ -56,25 +59,28 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): st = time.time() write_sec = fn(file, model, not legacy_save, 
io_buffer_mb) ckpt_size = os.path.getsize(file) - gb_size = ckpt_size/(1024**3) - gb_per_sec = gb_size/write_sec - print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') + gb_size = ckpt_size / (1024**3) + gb_per_sec = gb_size / write_sec + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + ) print(f'*********************************************') def _get_initialized_optimizer(model, fused_opt): base_optimizer = Adam(model.parameters()) import deepspeed - if fused_opt: + if fused_opt: from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper else: - from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper + from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper optimizer = FP16_Wrapper(base_optimizer) for p in model.parameters(): p.grad = torch.zeros_like(p) optimizer.step() return optimizer + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--folder', @@ -82,26 +88,28 @@ def parse_arguments(): type=str, required=True, help='Folder to use for I/O.') - - parser.add_argument('--model', - default=None, - type=str, - required=True, - help='Hugging Face transformers tag of model (e.g., gpt2).') + + parser.add_argument( + '--model', + default=None, + type=str, + required=True, + help='Hugging Face transformers tag of model (e.g., gpt2).') parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') - + parser.add_argument('--optimizer', action='store_true', help='Include optimizer state in checkpoint.') - parser.add_argument('--fused', action='store_true', help='Use fused fp16 optimizer.') + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, @@ -116,14 +124,15 @@ def validate_arguments(args): success = True if not os.path.exists(args.folder): 
print(f'Invalid folder: {args.folder}') - success = False + success = False if not args.model in HF_MODELS: print(f'{args.model} is not a supported HF model tag') - success = False + success = False return success + def main(): print(f'Performance test of deepspeed fast model checkpoint') print(f'torch version = {torch.__version__}') @@ -135,19 +144,17 @@ def main(): model, model_name, ckpt_name = _get_model(args.model) if args.optimizer: - model = model.half().cuda() + model = model.half().cuda() optimizer = _get_initialized_optimizer(model, args.fused) ckpt_state = {'model': model, 'optimizer': optimizer} else: + model = model.half() + if args.gpu: + model = model.cuda() ckpt_state = {'model': model} - run( - ckpt_state, - model_name, - ckpt_name, - args.folder, - args.legacy, + run(ckpt_state, model_name, ckpt_name, args.folder, args.legacy, args.io_buffer_mb) - + if __name__ == "__main__": main() diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 8edea0612..749c6ea31 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -5,14 +5,18 @@ from torch_save_utils import PINNED_BUFFER_MB from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save -def run(mb_size, folder, legacy_save, io_buffer_mb): - buffer = torch.randint(high=128, size=(mb_size*(1024**2), ), dtype=torch.uint8, device='cpu') + +def run(mb_size, folder, legacy_save, io_buffer_mb, device): + buffer = torch.randint(high=128, + size=(mb_size * (1024**2), ), + dtype=torch.uint8, + device=device) fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - 'test_ds_fast_save':test_ds_fast_save + 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): file = os.path.join(folder, f'{tag}_{mb_size}MB.pt') @@ -21,11 
+25,14 @@ def run(mb_size, folder, legacy_save, io_buffer_mb): os.remove(file) st = time.time() write_sec = fn(file, buffer, not legacy_save, io_buffer_mb) - gb_per_sec = mb_size/(1024.0*write_sec) - gb_size = os.path.getsize(file)/(1024**3) - print(f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s') + gb_per_sec = mb_size / (1024.0 * write_sec) + gb_size = os.path.getsize(file) / (1024**3) + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + ) print(f'*********************************************') + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--folder', @@ -42,6 +49,8 @@ def parse_arguments(): action='store_true', help='Use torch legacy save format') + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, @@ -52,15 +61,16 @@ def parse_arguments(): return args - def main(): print(f'Performance test of deepspeed fast tensor checkpoint') args = parse_arguments() if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') quit() - run(args.mb_size, args.folder, args.legacy, args.io_buffer_mb) - + + device = torch.cuda.current_device() if args.gpu else 'cpu' + run(args.mb_size, args.folder, args.legacy, args.io_buffer_mb, device) + if __name__ == "__main__": main() diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index f75499f0c..be86e2206 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -5,23 +5,23 @@ from deepspeed.ops.aio import AsyncIOBuilder from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter - AIO_QUEUE_DEPTH = 8 -AIO_BLOCK_SIZE = 8*(1024**2) +AIO_BLOCK_SIZE = 8 * (1024**2) AIO_THREAD_COUNT = 1 AIO_SINGLE_SUBMIT = False AIO_OVERLAP_EVENTS = False -PINNED_BUFFER_MB = 64 +PINNED_BUFFER_MB = 64 + def _get_aio_handle(): - h 
= AsyncIOBuilder().load().aio_handle( - block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - num_threads=AIO_THREAD_COUNT) + h = AsyncIOBuilder().load().aio_handle(block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + num_threads=AIO_THREAD_COUNT) return h + def test_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) @@ -31,31 +31,40 @@ def test_save(file, buffer, use_zipfile, io_buffer_mb): def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() ds_mock_writer = MockFileWriter(file) - torch.save(f=ds_mock_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - ds_mock_writer.close() # Force flush to storage + torch.save(f=ds_mock_writer, + obj=buffer, + _use_new_zipfile_serialization=use_zipfile) + ds_mock_writer.close() # Force flush to storage write_sec = time.time() - st ds_mock_writer._dump_state() - return write_sec + return write_sec + def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): st = time.time() - ds_py_writer = PyFileWriter(file) - torch.save(f=ds_py_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - ds_py_writer.close() # Force flush to storage + ds_py_writer = PyFileWriter(file) + torch.save(f=ds_py_writer, + obj=buffer, + _use_new_zipfile_serialization=use_zipfile) + ds_py_writer.close() # Force flush to storage write_sec = time.time() - st ds_py_writer._dump_state() - return write_sec + return write_sec + def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb): h = _get_aio_handle() - pinned_memory = torch.zeros(io_buffer_mb*(1024**2), dtype=torch.uint8, device='cpu').pin_memory() + pinned_memory = torch.zeros(io_buffer_mb * (1024**2), + dtype=torch.uint8, + device='cpu').pin_memory() st = time.time() - ds_fast_writer = FastFileWriter( - 
file_path=file, - aio_handle=h, - pinned_tensor=pinned_memory) - torch.save(f=ds_fast_writer, obj=buffer, _use_new_zipfile_serialization=use_zipfile) - ds_fast_writer.close() # Force flush to storage + ds_fast_writer = FastFileWriter(file_path=file, + aio_handle=h, + pinned_tensor=pinned_memory) + torch.save(f=ds_fast_writer, + obj=buffer, + _use_new_zipfile_serialization=use_zipfile) + ds_fast_writer.close() # Force flush to storage write_sec = time.time() - st ds_fast_writer._dump_state() - return write_sec \ No newline at end of file + return write_sec From 315f02ab7bc01f3f92207cf7a29606dfc2f40e15 Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Tue, 4 Jan 2022 19:09:35 -0800 Subject: [PATCH 08/40] --half and more flexible options --- fast_io/model_checkpoint/torch_save_model.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index d38f5af67..b71450074 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -110,6 +110,10 @@ def parse_arguments(): parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + parser.add_argument('--half', + action='store_true', + help='Use half-precision tensors.') + parser.add_argument('--io_buffer_mb', type=int, default=PINNED_BUFFER_MB, @@ -130,6 +134,11 @@ def validate_arguments(args): print(f'{args.model} is not a supported HF model tag') success = False + if args.optimizer and args.half: + if not args.gpu: + print(f'mixed precision only supported with gpu tensors') + success = False + return success @@ -143,14 +152,14 @@ def main(): quit() model, model_name, ckpt_name = _get_model(args.model) + if args.half: + model = model.half() + if args.gpu: + model = model.cuda() if args.optimizer: - model = model.half().cuda() optimizer = _get_initialized_optimizer(model, args.fused) ckpt_state = {'model': model, 'optimizer': optimizer} 
else: - model = model.half() - if args.gpu: - model = model.cuda() ckpt_state = {'model': model} run(ckpt_state, model_name, ckpt_name, args.folder, args.legacy, args.io_buffer_mb) From a41ba080139427a21ea90f92aebadcb2ed8e910d Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Sat, 8 Jan 2022 10:45:49 -0800 Subject: [PATCH 09/40] Add deepspeed.save_checkpoint() --- .../model_checkpoint/deepspeed_save_model.py | 117 ++++++++++++++++++ fast_io/model_checkpoint/save_model_utils.py | 114 +++++++++++++++++ fast_io/model_checkpoint/torch_save_model.py | 114 ++--------------- fast_io/model_checkpoint/torch_save_tensor.py | 23 ++-- fast_io/model_checkpoint/torch_save_utils.py | 19 +-- 5 files changed, 268 insertions(+), 119 deletions(-) create mode 100644 fast_io/model_checkpoint/deepspeed_save_model.py create mode 100644 fast_io/model_checkpoint/save_model_utils.py diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py new file mode 100644 index 000000000..a9c9db2f6 --- /dev/null +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -0,0 +1,117 @@ +import time +import torch +import os +import shutil +import deepspeed +from save_model_utils import get_model, validate_arguments, parse_arguments + + +def _get_ds_config(args, writer_type): + ds_config = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": args.zero_stage, + "cpu_offload": args.cpu_offload + }, + "fp16": { + "enabled": args.half + }, + "optimizer": { + "type": "Adam", + "params": { + "torch_adam": not args.fused + } + }, + "checkpoint": { + "checkpoint_serialization": not args.legacy + }, + "aio": { + "block_size": 8 * (1024**2), + "queue_depth": 8, + "single_submit": False, + "overlap_events": False, + "thread_count": 1, + } + } + + if writer_type: + ds_config["checkpoint"]["writer"] = { + "type": writer_type, + "io_buffer_size": args.io_buffer_mb * (1024**2), + "show_statistics": not args.no_statistics + } + + return 
ds_config + + +def _get_ds_engine(model, ds_config): + ds_engine, _, _, _ = deepspeed.initialize( + model=model, model_parameters=model.parameters(), config=ds_config) + + return ds_engine + + +def _do_optimizer_step(ds_engine): + for p in ds_engine.module.parameters(): + p.grad = torch.zeros_like(p) + ds_engine.step() + + +def test_save(tag, folder, model, args, writer_type): + ds_config = _get_ds_config(args, writer_type) + ds_engine = _get_ds_engine(model, ds_config) + if args.zero_stage == 0: + _do_optimizer_step(ds_engine) + + st = time.time() + ds_engine.save_checkpoint(save_dir=folder, tag=tag) + write_sec = time.time() - st + return write_sec + + +def _get_folder_size(folder): + size = 0 + for path, _, files in os.walk(folder): + size += sum([os.path.getsize(os.path.join(path, f)) for f in files]) + return size + + +def run(model, model_name, ckpt_name, args): + print(f'Model name = {model_name}') + writer_dict = { + 'test_save': None, + 'test_ds_mock_save': 'mock', + 'test_ds_py_save': 'python', + 'test_ds_fast_save': 'fast' + } + for tag, writer_type in writer_dict.items(): + folder = os.path.join(args.folder, ckpt_name, tag) + if os.path.exists(folder): + shutil.rmtree(folder) + write_sec = test_save(tag, folder, model, args, writer_type) + ckpt_size = _get_folder_size(folder) + gb_size = ckpt_size / (1024**3) + gb_per_sec = gb_size / write_sec + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + ) + print(f'*********************************************') + + +def main(): + print( + f'Performance test of deepspeed integration of fast model checkpointing.' 
+ ) + print(f'torch version = {torch.__version__}') + torch.manual_seed(42) + + args = parse_arguments() + if not validate_arguments(args): + quit() + + model, model_name, ckpt_name = get_model(args.model) + run(model, model_name, ckpt_name, args) + + +if __name__ == "__main__": + main() diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py new file mode 100644 index 000000000..c19062d13 --- /dev/null +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -0,0 +1,114 @@ +import argparse +import os +from transformers import AutoModelForCausalLM +from transformers import T5ForConditionalGeneration +from torch_save_utils import PINNED_BUFFER_MB + + +def _get_gpt_j_6B(tag): + model_name = "EleutherAI/gpt-j-6B" + model = AutoModelForCausalLM.from_pretrained(model_name) + ckpt_name = "gpt-j-6B" + return model, model_name, ckpt_name + + +def _get_tiny_t5(tag): + model_name = "hf-internal-testing/tiny-random-t5" + model = T5ForConditionalGeneration.from_pretrained(model_name) + ckpt_name = "tiny-random-t5" + return model, model_name, ckpt_name + + +def _get_hf_gpt2(tag): + model_name = tag + model = AutoModelForCausalLM.from_pretrained(tag) + ckpt_name = tag + return model, model_name, ckpt_name + + +HF_MODELS = { + 'tiny-t5': _get_tiny_t5, + 'gpt-j-6B': _get_gpt_j_6B, + 'gpt2': _get_hf_gpt2, + 'gpt2-large': _get_hf_gpt2, + 'gpt2-xl': _get_hf_gpt2, +} + + +def get_model(model_tag): + return HF_MODELS[model_tag](model_tag) + + +def validate_arguments(args): + success = True + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + success = False + + if not args.model in HF_MODELS: + print(f'{args.model} is not a supported HF model tag') + success = False + + if args.optimizer and args.half: + if not args.gpu: + print(f'mixed precision only supported with gpu tensors') + success = False + + return success + + +def parse_arguments(): + parser = argparse.ArgumentParser() + 
parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + + parser.add_argument( + '--model', + default=None, + type=str, + required=True, + help='Hugging Face transformers tag of model (e.g., gpt2).') + + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + + parser.add_argument('--optimizer', + action='store_true', + help='Include optimizer state in checkpoint.') + + parser.add_argument('--fused', + action='store_true', + help='Use fused fp16 optimizer.') + + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + + parser.add_argument('--half', + action='store_true', + help='Use half-precision tensors.') + + parser.add_argument( + '--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + help=f'Size of pinned i/o buffer in MB. Default = {PINNED_BUFFER_MB}') + + parser.add_argument('--zero_stage', + type=int, + default=0, + help='ZeRO optimization stage. Default = 0') + + parser.add_argument('--cpu_offload', + action='store_true', + help='Enable CPU offload of optimizer state.') + + parser.add_argument('--no-statistics', + action='store_true', + help='Suppress low-level performance statistics.') + + args = parser.parse_args() + print(f'args = {args}') + return args diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index b71450074..1375f048b 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -1,49 +1,13 @@ import time -import argparse import torch from torch.optim import Adam import os -from transformers import AutoModelForCausalLM -from transformers import T5ForConditionalGeneration -from torch_save_utils import PINNED_BUFFER_MB from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save +from save_model_utils import get_model, validate_arguments, parse_arguments -def _get_gpt_j_6B(tag): - 
model_name = "EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name) #.half() - ckpt_name = "gpt-j-6B" - return model, model_name, ckpt_name - - -def _get_tiny_t5(tag): - model_name = "hf-internal-testing/tiny-random-t5" - model = T5ForConditionalGeneration.from_pretrained(model_name) #.half() - ckpt_name = "tiny-random-t5" - return model, model_name, ckpt_name - - -def _get_hf_gpt2(tag): - model_name = tag - model = AutoModelForCausalLM.from_pretrained(tag) - ckpt_name = tag - return model, model_name, ckpt_name - - -HF_MODELS = { - 'tiny-random-t5': _get_tiny_t5, - 'gpt-j-6B': _get_gpt_j_6B, - 'gpt2': _get_hf_gpt2, - 'gpt2-large': _get_hf_gpt2, - 'gpt2-xl': _get_hf_gpt2, -} - - -def _get_model(model_tag): - return HF_MODELS[model_tag](model_tag) - - -def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): +def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb, + show_statistics): print(f'Model name = {model_name}') fn_dict = { 'test_save': test_save, @@ -57,7 +21,8 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, model, not legacy_save, io_buffer_mb) + write_sec = fn(file, model, not legacy_save, io_buffer_mb, + show_statistics) ckpt_size = os.path.getsize(file) gb_size = ckpt_size / (1024**3) gb_per_sec = gb_size / write_sec @@ -81,69 +46,10 @@ def _get_initialized_optimizer(model, fused_opt): return optimizer -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--folder', - default=None, - type=str, - required=True, - help='Folder to use for I/O.') - - parser.add_argument( - '--model', - default=None, - type=str, - required=True, - help='Hugging Face transformers tag of model (e.g., gpt2).') - - parser.add_argument('--legacy', - action='store_true', - help='Use torch legacy save format') - - parser.add_argument('--optimizer', - action='store_true', - help='Include 
optimizer state in checkpoint.') - - parser.add_argument('--fused', - action='store_true', - help='Use fused fp16 optimizer.') - - parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') - - parser.add_argument('--half', - action='store_true', - help='Use half-precision tensors.') - - parser.add_argument('--io_buffer_mb', - type=int, - default=PINNED_BUFFER_MB, - help='Size of pinned i/o buffer in MB.') - - args = parser.parse_args() - print(f'args = {args}') - return args - - -def validate_arguments(args): - success = True - if not os.path.exists(args.folder): - print(f'Invalid folder: {args.folder}') - success = False - - if not args.model in HF_MODELS: - print(f'{args.model} is not a supported HF model tag') - success = False - - if args.optimizer and args.half: - if not args.gpu: - print(f'mixed precision only supported with gpu tensors') - success = False - - return success - - def main(): - print(f'Performance test of deepspeed fast model checkpoint') + print( + f'Performance test of torch.save() integration of fast model checkpointing.' 
+ ) print(f'torch version = {torch.__version__}') torch.manual_seed(42) @@ -151,7 +57,7 @@ def main(): if not validate_arguments(args): quit() - model, model_name, ckpt_name = _get_model(args.model) + model, model_name, ckpt_name = get_model(args.model) if args.half: model = model.half() if args.gpu: @@ -162,7 +68,7 @@ def main(): else: ckpt_state = {'model': model} run(ckpt_state, model_name, ckpt_name, args.folder, args.legacy, - args.io_buffer_mb) + args.io_buffer_mb, not args.no_statistics) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 749c6ea31..0ecf0e6ef 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -6,9 +6,10 @@ from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save -def run(mb_size, folder, legacy_save, io_buffer_mb, device): +def run(args): + device = torch.cuda.current_device() if args.gpu else 'cpu' buffer = torch.randint(high=128, - size=(mb_size * (1024**2), ), + size=(args.mb_size * (1024**2), ), dtype=torch.uint8, device=device) @@ -19,13 +20,14 @@ def run(mb_size, folder, legacy_save, io_buffer_mb, device): 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): - file = os.path.join(folder, f'{tag}_{mb_size}MB.pt') + file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') print(f'checkpoint file = {file}') if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, buffer, not legacy_save, io_buffer_mb) - gb_per_sec = mb_size / (1024.0 * write_sec) + write_sec = fn(file, buffer, not args.legacy, args.io_buffer_mb, + not args.no_statistics) + gb_per_sec = args.mb_size / (1024.0 * write_sec) gb_size = os.path.getsize(file) / (1024**3) print( f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' @@ -56,20 +58,25 @@ def parse_arguments(): default=PINNED_BUFFER_MB, help='Size of pinned i/o 
buffer in MB.') + parser.add_argument('--no-statistics', + action='store_true', + help='Suppress low-level performance statistics.') + args = parser.parse_args() print(f'args = {args}') return args def main(): - print(f'Performance test of deepspeed fast tensor checkpoint') + print( + f'Performance test of torch.save() integration of fast tensor checkpointing.' + ) args = parse_arguments() if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') quit() - device = torch.cuda.current_device() if args.gpu else 'cpu' - run(args.mb_size, args.folder, args.legacy, args.io_buffer_mb, device) + run(args) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index be86e2206..3455a1a60 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -22,13 +22,14 @@ def _get_aio_handle(): return h -def test_save(file, buffer, use_zipfile, io_buffer_mb): +def test_save(file, buffer, use_zipfile, io_buffer_mb, show_statistics): st = time.time() torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) return time.time() - st -def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): +def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb, + show_statistics): st = time.time() ds_mock_writer = MockFileWriter(file) torch.save(f=ds_mock_writer, @@ -36,11 +37,12 @@ def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb): _use_new_zipfile_serialization=use_zipfile) ds_mock_writer.close() # Force flush to storage write_sec = time.time() - st - ds_mock_writer._dump_state() + if show_statistics: + ds_mock_writer._dump_state() return write_sec -def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb): +def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb, show_statistics): st = time.time() ds_py_writer = PyFileWriter(file) torch.save(f=ds_py_writer, @@ -48,11 +50,13 @@ def test_ds_py_save(file, 
buffer, use_zipfile, io_buffer_mb): _use_new_zipfile_serialization=use_zipfile) ds_py_writer.close() # Force flush to storage write_sec = time.time() - st - ds_py_writer._dump_state() + if show_statistics: + ds_py_writer._dump_state() return write_sec -def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb): +def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb, + show_statistics): h = _get_aio_handle() pinned_memory = torch.zeros(io_buffer_mb * (1024**2), dtype=torch.uint8, @@ -66,5 +70,6 @@ def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb): _use_new_zipfile_serialization=use_zipfile) ds_fast_writer.close() # Force flush to storage write_sec = time.time() - st - ds_fast_writer._dump_state() + if show_statistics: + ds_fast_writer._dump_state() return write_sec From 4fcb06040b185f08f6273736054cbef08eb8e2a0 Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Sat, 8 Jan 2022 12:46:06 -0800 Subject: [PATCH 10/40] Free ds memory --- fast_io/model_checkpoint/deepspeed_save_model.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index a9c9db2f6..7cad72f36 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -2,6 +2,7 @@ import torch import os import shutil +import gc import deepspeed from save_model_utils import get_model, validate_arguments, parse_arguments @@ -57,6 +58,16 @@ def _do_optimizer_step(ds_engine): ds_engine.step() +def _free_ds_memory(ds_engine): + ds_engine.optimizer.optimizer = None + ds_engine.optimizer = None + ds_engine.module = None + ds_engine = None + del ds_engine + gc.collect() + torch.cuda.empty_cache() + + def test_save(tag, folder, model, args, writer_type): ds_config = _get_ds_config(args, writer_type) ds_engine = _get_ds_engine(model, ds_config) @@ -66,6 +77,7 @@ def test_save(tag, folder, model, args, writer_type): st = time.time() 
ds_engine.save_checkpoint(save_dir=folder, tag=tag) write_sec = time.time() - st + _free_ds_memory(ds_engine) return write_sec From a49c5424977196ee0bc0f55dcfbec5591315294a Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Sat, 8 Jan 2022 13:24:24 -0800 Subject: [PATCH 11/40] Improve repro --- fast_io/model_checkpoint/deepspeed_save_model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 7cad72f36..4cf6b4ae1 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -3,6 +3,8 @@ import os import shutil import gc +import random +import numpy as np import deepspeed from save_model_utils import get_model, validate_arguments, parse_arguments @@ -116,7 +118,8 @@ def main(): ) print(f'torch version = {torch.__version__}') torch.manual_seed(42) - + np.random.seed(0) + random.seed(0) args = parse_arguments() if not validate_arguments(args): quit() From 233b9e92bb7fa63bc6609a0f652418bf4d53ca4e Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 22 Feb 2022 12:13:57 -0800 Subject: [PATCH 12/40] Double I/O buffer (#56) --- .../model_checkpoint/deepspeed_save_model.py | 3 +- fast_io/model_checkpoint/save_model_utils.py | 4 +++ fast_io/model_checkpoint/torch_save_model.py | 11 +++---- fast_io/model_checkpoint/torch_save_tensor.py | 7 +++-- fast_io/model_checkpoint/torch_save_utils.py | 31 ++++++++++--------- 5 files changed, 31 insertions(+), 25 deletions(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 4cf6b4ae1..081fe0299 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -41,7 +41,8 @@ def _get_ds_config(args, writer_type): ds_config["checkpoint"]["writer"] = { "type": writer_type, "io_buffer_size": args.io_buffer_mb * (1024**2), - 
"show_statistics": not args.no_statistics + "io_buffer_double": not args.single_io_buffer, + "show_statistics": not args.no_statistics, } return ds_config diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index c19062d13..24f9f87d1 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -109,6 +109,10 @@ def parse_arguments(): action='store_true', help='Suppress low-level performance statistics.') + parser.add_argument('--single_io_buffer', + action='store_true', + help='Disable double buffering of i/o buffer.') + args = parser.parse_args() print(f'args = {args}') return args diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index 1375f048b..245d49e30 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -6,8 +6,7 @@ from save_model_utils import get_model, validate_arguments, parse_arguments -def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb, - show_statistics): +def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') fn_dict = { 'test_save': test_save, @@ -16,13 +15,12 @@ def run(model, model_name, ckpt_name, folder, legacy_save, io_buffer_mb, 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): - file = os.path.join(folder, f'{tag}_{ckpt_name}.pt') + file = os.path.join(args.folder, f'{tag}_{ckpt_name}.pt') print(f'checkpoint file = {file}') if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, model, not legacy_save, io_buffer_mb, - show_statistics) + write_sec = fn(file, model, args) ckpt_size = os.path.getsize(file) gb_size = ckpt_size / (1024**3) gb_per_sec = gb_size / write_sec @@ -67,8 +65,7 @@ def main(): ckpt_state = {'model': model, 'optimizer': optimizer} else: ckpt_state = {'model': model} - run(ckpt_state, model_name, ckpt_name, 
args.folder, args.legacy, - args.io_buffer_mb, not args.no_statistics) + run(ckpt_state, model_name, ckpt_name, args) if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 0ecf0e6ef..80d5f1358 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -25,8 +25,7 @@ def run(args): if os.path.isfile(file): os.remove(file) st = time.time() - write_sec = fn(file, buffer, not args.legacy, args.io_buffer_mb, - not args.no_statistics) + write_sec = fn(file, buffer, args) gb_per_sec = args.mb_size / (1024.0 * write_sec) gb_size = os.path.getsize(file) / (1024**3) print( @@ -62,6 +61,10 @@ def parse_arguments(): action='store_true', help='Suppress low-level performance statistics.') + parser.add_argument('--single_io_buffer', + action='store_true', + help='Disable double buffering of i/o buffer.') + args = parser.parse_args() print(f'args = {args}') return args diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index 3455a1a60..c01fd014c 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -22,54 +22,55 @@ def _get_aio_handle(): return h -def test_save(file, buffer, use_zipfile, io_buffer_mb, show_statistics): +def test_save(file, buffer, args): st = time.time() - torch.save(f=file, obj=buffer, _use_new_zipfile_serialization=use_zipfile) + torch.save(f=file, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) return time.time() - st -def test_ds_mock_save(file, buffer, use_zipfile, io_buffer_mb, - show_statistics): +def test_ds_mock_save(file, buffer, args): st = time.time() ds_mock_writer = MockFileWriter(file) torch.save(f=ds_mock_writer, obj=buffer, - _use_new_zipfile_serialization=use_zipfile) + _use_new_zipfile_serialization=not args.legacy) ds_mock_writer.close() # Force flush to storage write_sec = 
time.time() - st - if show_statistics: + if not args.no_statistics: ds_mock_writer._dump_state() return write_sec -def test_ds_py_save(file, buffer, use_zipfile, io_buffer_mb, show_statistics): +def test_ds_py_save(file, buffer, args): st = time.time() ds_py_writer = PyFileWriter(file) torch.save(f=ds_py_writer, obj=buffer, - _use_new_zipfile_serialization=use_zipfile) + _use_new_zipfile_serialization=not args.legacy) ds_py_writer.close() # Force flush to storage write_sec = time.time() - st - if show_statistics: + if not args.no_statistics: ds_py_writer._dump_state() return write_sec -def test_ds_fast_save(file, buffer, use_zipfile, io_buffer_mb, - show_statistics): +def test_ds_fast_save(file, buffer, args): h = _get_aio_handle() - pinned_memory = torch.zeros(io_buffer_mb * (1024**2), + pinned_memory = torch.zeros(args.io_buffer_mb * (1024**2), dtype=torch.uint8, device='cpu').pin_memory() st = time.time() ds_fast_writer = FastFileWriter(file_path=file, aio_handle=h, - pinned_tensor=pinned_memory) + pinned_tensor=pinned_memory, + double_buffer=not args.single_io_buffer) torch.save(f=ds_fast_writer, obj=buffer, - _use_new_zipfile_serialization=use_zipfile) + _use_new_zipfile_serialization=not args.legacy) ds_fast_writer.close() # Force flush to storage write_sec = time.time() - st - if show_statistics: + if not args.no_statistics: ds_fast_writer._dump_state() return write_sec From b1f02b21b0643083a982d8f9f2cebc640d6a2f5b Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 11 Mar 2022 13:07:03 -0800 Subject: [PATCH 13/40] Double I/O buffer (#60) From a16ac9eed8f8401cb265edcd639d4d6c3572c083 Mon Sep 17 00:00:00 2001 From: jerryyangli Date: Tue, 15 Mar 2022 06:51:29 -0700 Subject: [PATCH 14/40] Add checkpoint comparison (#62) * Add checkpoint comparison * Corrected a typo Co-authored-by: Yang Li --- .../model_checkpoint/checkpoint_compare.py | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 
fast_io/model_checkpoint/checkpoint_compare.py diff --git a/fast_io/model_checkpoint/checkpoint_compare.py b/fast_io/model_checkpoint/checkpoint_compare.py new file mode 100644 index 000000000..cc67b61d9 --- /dev/null +++ b/fast_io/model_checkpoint/checkpoint_compare.py @@ -0,0 +1,123 @@ +#This script is for testing whether two checkpoints match; it prints all the differences + +import torch +import os +import sys +import pickle +from collections import OrderedDict + +exclude_key_str = {'ds_config/checkpoint/writer'} + +def main(): + dir1 = sys.argv[1] + dir2 = sys.argv[2] + print ("Begin comparison") + print ("The first directory {}" .format(dir1)) + print ("The second directory {}" .format(dir2)) + print (' ') + + file_list1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))] + file_list2 = [f for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))] + common_files = [] + + for f in file_list1: + if not (f in file_list2): + log_error_file_mismatch_first(f) + else: + common_files.append(f) + for f in file_list2: + if not (f in file_list1): + log_error_file_mismatch_second(f) + + for f in common_files: + full_dir1 = os.path.join(dir1, f) + full_dir2 = os.path.join(dir2, f) + print ("Begin comparison") + print("The first checkpoint {}" .format(full_dir1)) + print("The second checkpoint {}" .format(full_dir2)) + print(' ') + model_first = torch.load(full_dir1) + model_second = torch.load(full_dir2) + object_compare(model_first, model_second, []) + + +def object_compare(model_first, model_second, key_chain): + if not (type(model_first) == type(model_second)): + log_error_value_mismatch(model_first, model_second, key_chain) + return + + if type(model_first) is list: + if len(model_first) != len(model_second): + log_error_value_mismatch(model_first, model_second, key_chain) + return + for i in range(len(model_first)): + object_compare(model_first[i], model_second[i], key_chain) + return + + if type(model_first) is dict or type(model_first) 
is OrderedDict: + common_keys = [] + for key in model_first: + if key not in model_second: + key_chain.append(key) + log_error_key_mismatch_first(model_first[key], key_chain) + key_chain.pop() + else: + common_keys.append(key) + + for key in model_second: + if key not in model_first: + key_chain.append(key) + log_error_key_mismatch_second(model_second[key], key_chain) + key_chain.pop() + + for key in common_keys: + key_chain.append(key) + object_compare(model_first[key], model_second[key], key_chain) + key_chain.pop() + return + + if hasattr(model_first, '__dict__'): + equality = (model_first.__dict__ == model_second.__dict__) + else: + equality = (model_first == model_second) + if type(equality) is not bool: + equality = (equality.all()) + if not equality: + log_error_value_mismatch(model_first, model_second, key_chain) + return + + +def log_error_file_mismatch_first(filename): + print("The following file appeared in the first but not the second directory: {}" .format(filename)) + print(' ') + + +def log_error_file_mismatch_second(filename): + print("The following file appeared in the second but not the first directory: {}" .format(filename)) + print(" ") + + +def log_error_key_mismatch_first(model, key_chain): + key_str = "/".join(key_chain) + if not (key_str in exclude_key_str): + print("The following key appeared in the first but not the second model: {}" .format(key_str)) + print("The value of the first model is: {}" .format(model)) + print(" ") + + +def log_error_key_mismatch_second(model, key_chain): + key_str = "/".join(key_chain) + if not (key_str in exclude_key_str): + print("The following key appeared in the second but not the first model: {}" .format(key_str)) + print("The value of the second model is: {}" .format(model)) + print(" ") + + +def log_error_value_mismatch(model_first, model_second, key_chain): + print ("The values of the following key do not match: {}" .format("/".join(key_chain))) + print ("The value of the first model is: {}" 
.format(model_first)) + print ("The value of the second model is: {}" .format(model_second)) + print(" ") + +if __name__ == "__main__": + main() From b945adc06ddea9a646a91a785e119b8c10b3ab27 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 19 Mar 2022 16:47:46 +0000 Subject: [PATCH 15/40] save_checkpoint perf monitoring --- Megatron-LM-v1.1.5-ZeRO3/megatron/training.py | 28 +++++++++++++------ Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py | 19 +++++++++++++ 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py index 0245caea4..68ce5c4c4 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py +++ b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py @@ -38,7 +38,7 @@ from megatron.model.realm_model import ICTBertModel from megatron.utils import check_adlr_autoresume_termination from megatron.utils import make_data_loader -from megatron.utils import report_memory, flops_calculator +from megatron.utils import report_memory, flops_calculator, throughput_calculator, checkpoint_throughput_calculator import deepspeed from deepspeed.runtime.utils import see_memory_usage @@ -384,6 +384,7 @@ def add_to_logging(name): add_to_logging('backward-clip-grad') add_to_logging('optimizer') add_to_logging('batch generator') + add_to_logging('save checkpoint') # Tensorboard values. 
if writer and torch.distributed.get_rank() == 0: @@ -423,12 +424,14 @@ def add_to_logging(name): total_loss_dict[got_nan_key]) total_loss_dict[skipped_iters_key] = 0 total_loss_dict[got_nan_key] = 0 + timers.log(timers_to_log, normalizer=args.log_interval) print_rank_0(log_string) if report_memory_flag: report_memory('after {} iterations'.format(iteration)) report_memory_flag = False - timers.log(timers_to_log, normalizer=args.log_interval) + flops_calculator(model, args, elapsed_time) + throughput_calculator(model, args, elapsed_time) return report_memory_flag @@ -462,11 +465,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler, loss_scale = None if args.fp16: loss_scale = optimizer.cur_scale if args.deepspeed else optimizer.loss_scale - report_memory_flag = training_log(loss_dict, total_loss_dict, - optimizer.param_groups[0]['lr'], - iteration, loss_scale, - report_memory_flag, skipped_iter, - model=model) # Autoresume if args.adlr_autoresume and \ @@ -475,9 +473,21 @@ def train(forward_step_func, model, optimizer, lr_scheduler, lr_scheduler) # Checkpointing - if args.save and args.save_interval and \ - iteration % args.save_interval == 0: + should_save_checkpoint = args.save and args.save_interval and \ + iteration % args.save_interval == 0 + timers('save checkpoint').start() + if should_save_checkpoint: save_checkpoint(iteration, model, optimizer, lr_scheduler) + timers('save checkpoint').stop() + + if should_save_checkpoint: + checkpoint_throughput_calculator(model, args, timers('save checkpoint').elapsed(reset=False)) + + report_memory_flag = training_log(loss_dict, total_loss_dict, + optimizer.param_groups[0]['lr'], + iteration, loss_scale, + report_memory_flag, skipped_iter, + model=model) # Evaluation # XXX temporarily disabled for ZeRO-3 diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py index 86fcf5ed4..9880f12ca 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py +++ 
b/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py @@ -194,3 +194,22 @@ def flops_calculator(model, args, iteration_time): effective_tera_flops_per_gpu = giga_flops_per_model_per_train_step / (iteration_time * 1000.0 * gpus_per_model) print_rank_0(f"Effective Tera Flops per GPU: {round(effective_tera_flops_per_gpu, 2)} and total parameters {round(approx_parameters_in_billions, 3)} B") + + +def throughput_calculator(model, args, iteration_time): + gpus_per_model = torch.distributed.get_world_size(group = mpu.get_model_parallel_group()) + samples_per_model = args.batch_size * args.seq_length + model_replica_count = torch.distributed.get_world_size() / gpus_per_model + approx_parameters_in_billions = get_parameters_in_billions(model) + samples_per_second = samples_per_model * model_replica_count / (iteration_time * 1000.0) + + print_rank_0(f'Samples per second: {round(samples_per_second, 2)} and total parameters {round(approx_parameters_in_billions, 3)} B') + + +def checkpoint_throughput_calculator(model, args, latency_sec): + approx_parameters_in_billions = get_parameters_in_billions(model) + checkpoint_multiplier = 12 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4) + checkpoint_giga_bytes = approx_parameters_in_billions * checkpoint_multiplier + giga_bytes_per_second = checkpoint_giga_bytes / latency_sec + + print_rank_0(f'Checkpoint Save GB: {round(checkpoint_giga_bytes, 3)}, GB_PerSec: {round(giga_bytes_per_second, 2)}, Latency(secs): {round(latency_sec, 3)}') From 64a8f75ccc05de25fe691320e961f81a7a113601 Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Tue, 22 Mar 2022 18:36:02 +0000 Subject: [PATCH 16/40] Disable checkpoint save on exit --- Megatron-LM-v1.1.5-ZeRO3/megatron/training.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py index 68ce5c4c4..bda88cc25 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py +++ 
b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py @@ -106,8 +106,8 @@ def pretrain(train_valid_test_dataset_provider, model_provider, valid_data_iterator, model, iteration, False) - if args.save and iteration != 0: - save_checkpoint(iteration, model, optimizer, lr_scheduler) +# if args.save and iteration != 0: +# save_checkpoint(iteration, model, optimizer, lr_scheduler) if args.do_test: # Run on test data. @@ -175,8 +175,8 @@ def get_optimizer(model): weight_decay=args.weight_decay) else: # Use torch Adam instead of Fused Adam from NVIDIA which seems to have some issue. - #optimizer = Adam(param_groups, - optimizer = torch.optim.AdamW(param_groups, + optimizer = Adam(param_groups, + #optimizer = torch.optim.AdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), From 44b8664274fd06176269d6c4777a7292bb3fddbd Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 22 Mar 2022 11:41:32 -0700 Subject: [PATCH 17/40] Perf statistics for save_checkpoint (#64) * save_checkpoint perf monitoring * Disable checkpoint save on exit --- Megatron-LM-v1.1.5-ZeRO3/megatron/training.py | 36 ++++++++++++------- Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py | 19 ++++++++++ 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py index 0245caea4..bda88cc25 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py +++ b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py @@ -38,7 +38,7 @@ from megatron.model.realm_model import ICTBertModel from megatron.utils import check_adlr_autoresume_termination from megatron.utils import make_data_loader -from megatron.utils import report_memory, flops_calculator +from megatron.utils import report_memory, flops_calculator, throughput_calculator, checkpoint_throughput_calculator import deepspeed from deepspeed.runtime.utils import see_memory_usage @@ -106,8 +106,8 @@ def 
pretrain(train_valid_test_dataset_provider, model_provider, valid_data_iterator, model, iteration, False) - if args.save and iteration != 0: - save_checkpoint(iteration, model, optimizer, lr_scheduler) +# if args.save and iteration != 0: +# save_checkpoint(iteration, model, optimizer, lr_scheduler) if args.do_test: # Run on test data. @@ -175,8 +175,8 @@ def get_optimizer(model): weight_decay=args.weight_decay) else: # Use torch Adam instead of Fused Adam from NVIDIA which seems to have some issue. - #optimizer = Adam(param_groups, - optimizer = torch.optim.AdamW(param_groups, + optimizer = Adam(param_groups, + #optimizer = torch.optim.AdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), @@ -384,6 +384,7 @@ def add_to_logging(name): add_to_logging('backward-clip-grad') add_to_logging('optimizer') add_to_logging('batch generator') + add_to_logging('save checkpoint') # Tensorboard values. if writer and torch.distributed.get_rank() == 0: @@ -423,12 +424,14 @@ def add_to_logging(name): total_loss_dict[got_nan_key]) total_loss_dict[skipped_iters_key] = 0 total_loss_dict[got_nan_key] = 0 + timers.log(timers_to_log, normalizer=args.log_interval) print_rank_0(log_string) if report_memory_flag: report_memory('after {} iterations'.format(iteration)) report_memory_flag = False - timers.log(timers_to_log, normalizer=args.log_interval) + flops_calculator(model, args, elapsed_time) + throughput_calculator(model, args, elapsed_time) return report_memory_flag @@ -462,11 +465,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler, loss_scale = None if args.fp16: loss_scale = optimizer.cur_scale if args.deepspeed else optimizer.loss_scale - report_memory_flag = training_log(loss_dict, total_loss_dict, - optimizer.param_groups[0]['lr'], - iteration, loss_scale, - report_memory_flag, skipped_iter, - model=model) # Autoresume if args.adlr_autoresume and \ @@ -475,9 +473,21 @@ def train(forward_step_func, model, optimizer, 
lr_scheduler, lr_scheduler) # Checkpointing - if args.save and args.save_interval and \ - iteration % args.save_interval == 0: + should_save_checkpoint = args.save and args.save_interval and \ + iteration % args.save_interval == 0 + timers('save checkpoint').start() + if should_save_checkpoint: save_checkpoint(iteration, model, optimizer, lr_scheduler) + timers('save checkpoint').stop() + + if should_save_checkpoint: + checkpoint_throughput_calculator(model, args, timers('save checkpoint').elapsed(reset=False)) + + report_memory_flag = training_log(loss_dict, total_loss_dict, + optimizer.param_groups[0]['lr'], + iteration, loss_scale, + report_memory_flag, skipped_iter, + model=model) # Evaluation # XXX temporarily disabled for ZeRO-3 diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py index 86fcf5ed4..9880f12ca 100644 --- a/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py +++ b/Megatron-LM-v1.1.5-ZeRO3/megatron/utils.py @@ -194,3 +194,22 @@ def flops_calculator(model, args, iteration_time): effective_tera_flops_per_gpu = giga_flops_per_model_per_train_step / (iteration_time * 1000.0 * gpus_per_model) print_rank_0(f"Effective Tera Flops per GPU: {round(effective_tera_flops_per_gpu, 2)} and total parameters {round(approx_parameters_in_billions, 3)} B") + + +def throughput_calculator(model, args, iteration_time): + gpus_per_model = torch.distributed.get_world_size(group = mpu.get_model_parallel_group()) + samples_per_model = args.batch_size * args.seq_length + model_replica_count = torch.distributed.get_world_size() / gpus_per_model + approx_parameters_in_billions = get_parameters_in_billions(model) + samples_per_second = samples_per_model * model_replica_count / (iteration_time * 1000.0) + + print_rank_0(f'Samples per second: {round(samples_per_second, 2)} and total parameters {round(approx_parameters_in_billions, 3)} B') + + +def checkpoint_throughput_calculator(model, args, latency_sec): + approx_parameters_in_billions = 
get_parameters_in_billions(model) + checkpoint_multiplier = 12 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4) + checkpoint_giga_bytes = approx_parameters_in_billions * checkpoint_multiplier + giga_bytes_per_second = checkpoint_giga_bytes / latency_sec + + print_rank_0(f'Checkpoint Save GB: {round(checkpoint_giga_bytes, 3)}, GB_PerSec: {round(giga_bytes_per_second, 2)}, Latency(secs): {round(latency_sec, 3)}') From ff4bd69edb15acbd94881e73b41c1c54e50dec95 Mon Sep 17 00:00:00 2001 From: GuanhuaWang Date: Wed, 21 Sep 2022 18:48:16 +0000 Subject: [PATCH 18/40] add logs for a100-80 --- .../log_9_21_22/gpt2-unfused.txt | 599 ++++++++++++++ .../log_9_21_22/gpt2_fused_z2.txt | 781 ++++++++++++++++++ 2 files changed, 1380 insertions(+) create mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt create mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt new file mode 100644 index 000000000..33985e8db --- /dev/null +++ b/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt @@ -0,0 +1,599 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) +Model name = gpt2-large +[2022-09-21 18:42:17,245] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:17,246] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
+[2022-09-21 18:42:18,108] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-21 18:42:18,109] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl +[2022-09-21 18:42:21,535] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +NCCL version 2.10.3+cuda11.3 +[2022-09-21 18:42:21,770] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:21,772] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:21,772] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:21,772] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:22,127] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:22,127] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_enabled ................. 
True +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_rounding ............ 
0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_config .................. 
{ + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:22,130] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module utils... 
+Time to load utils op: 0.3399326801300049 seconds +[2022-09-21 18:42:23,204] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt +test_save -- 10.13 GB, 6.83 secs, 1.48 gb/s +********************************************* +[2022-09-21 18:42:30,157] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:30,164] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:30,277] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:30,278] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:30,278] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:30,278] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:30,656] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:30,656] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_offset .............. 
1000 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:30,659] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "mock", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0004949569702148438 seconds +[2022-09-21 18:42:30,786] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +test_ds_mock_save -- 0.00 GB, 0.93 secs, 0.00 gb/s +********************************************* +[2022-09-21 18:42:32,824] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:32,831] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:32,926] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:32,927] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:32,927] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:32,927] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:33,248] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:33,248] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] elasticity_enabled ........... 
False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_groups .............. 
1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "python", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.000392913818359375 seconds +[2022-09-21 18:42:33,377] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523621, 'write_secs': 5.274229288101196} +test_ds_py_save -- 10.13 GB, 6.32 secs, 1.60 gb/s +********************************************* +[2022-09-21 18:42:39,940] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:39,946] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:40,048] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:40,049] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:40,049] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:40,049] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:40,439] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:40,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... +Building extension module async_io... +Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module async_io... +Time to load async_io op: 0.4869067668914795 seconds +[2022-09-21 18:42:41,329] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 
0 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_job_name ......... 
DeepSpeedJobName +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_optimization_stage ...... 
0 +[2022-09-21 18:42:41,332] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "fast", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0004849433898925781 seconds +[2022-09-21 18:42:41,458] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0003745555877685547 seconds +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 1.8456230163574219, 'aio_write_secs': 0.9408478736877441, 'aio_bytes': 10874523136, 'aio_gbs': 10.76442766994695, 'slow_bytes': 483, 'slow_write_secs': 0.0002315044403076172, 'fill_buffer_count': 4519, 'fill_buffer_secs': 0.9024286270141602, 'fill_buffer_speed': 11.22270347101499, 'save_storage': 0, 'save_storage_bytes': 0} +test_ds_fast_save -- 10.13 GB, 3.00 secs, 3.38 gb/s +********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt new file mode 100644 index 000000000..9871b634e --- /dev/null +++ b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt @@ -0,0 +1,781 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=True, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=2) +Model name = gpt2-large +[2022-09-21 18:45:23,129] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:45:23,130] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
+[2022-09-21 18:45:23,991] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-21 18:45:23,991] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl +[2022-09-21 18:45:27,189] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +NCCL version 2.10.3+cuda11.3 +[2022-09-21 18:45:27,478] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Creating extension directory /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam... +Detected CUDA files, patching ldflags +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam/build.ninja... +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options 
'-fPIC' -lineinfo -O3 --use_fast_math -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o +[2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o +[3/3] c++ fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_adam.so +Loading extension module fused_adam... 
+Time to load fused_adam op: 19.252447843551636 seconds +[2022-09-21 18:45:47,263] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:45:47,263] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:45:47,263] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:45:47,263] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:45:47,263] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:45:47,263] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module utils... 
+Time to load utils op: 0.3341379165649414 seconds +[2022-09-21 18:45:47,651] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:45:47,652] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB +[2022-09-21 18:45:47,652] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.58 GB, percent = 1.3% +[2022-09-21 18:45:47,945] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:45:47,946] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB +[2022-09-21 18:45:47,946] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% +[2022-09-21 18:45:48,634] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:45:48,635] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,635] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% +[2022-09-21 18:45:48,681] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:45:48,682] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,682] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.53 GB, percent = 1.3% +[2022-09-21 18:45:48,733] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:45:48,734] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,734] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.4 GB, percent = 1.3% +[2022-09-21 18:45:48,796] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:45:48,797] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 7.38 GB 
Max_CA 7 GB +[2022-09-21 18:45:48,797] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,848] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:45:48,849] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 7.38 GB Max_CA 7 GB +[2022-09-21 18:45:48,849] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,920] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:45:48,921] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 16.04 GB Max_CA 16 GB +[2022-09-21 18:45:48,921] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,921] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:45:48,968] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:45:48,969] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 16.04 GB Max_CA 16 GB +[2022-09-21 18:45:48,969] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:45:48,969] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:45:48,969] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_offset .............. 
1000 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:45:48,972] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0004029273986816406 seconds +[2022-09-21 18:45:49,143] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt +[2022-09-21 18:45:56,478] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_save/zero_to_fp32.py +[2022-09-21 18:45:56,479] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_save -- 10.13 GB, 7.51 secs, 1.35 gb/s +********************************************* +[2022-09-21 18:45:56,603] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:45:56,610] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:45:56,709] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.0011363029479980469 seconds +[2022-09-21 18:45:56,771] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:45:56,771] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:45:56,771] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:45:56,771] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:45:56,771] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:45:56,771] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.00023317337036132812 seconds +[2022-09-21 18:45:56,823] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:45:56,824] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 16 GB +[2022-09-21 18:45:56,824] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.55 GB, percent = 1.3% +[2022-09-21 18:45:57,123] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:45:57,124] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,124] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.54 GB, percent = 1.3% +[2022-09-21 18:45:57,614] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:45:57,615] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,616] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.51 GB, percent = 1.3% 
+[2022-09-21 18:45:57,661] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:45:57,662] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,662] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% +[2022-09-21 18:45:57,713] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:45:57,714] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,714] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,775] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:45:57,775] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:45:57,776] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:57,827] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:45:57,828] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:45:57,828] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,887] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:45:57,887] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:45:57,888] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.38 GB, percent = 1.3% +[2022-09-21 18:45:57,888] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:45:57,933] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:45:57,934] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:45:57,934] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,934] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:45:57,935] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:45:57,935] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... 
None +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... 
False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_gradients_enabled ..... 
False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_optimization_stage ...... 
2 +[2022-09-21 18:45:57,937] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "mock", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.000377655029296875 seconds +[2022-09-21 18:45:57,942] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +[2022-09-21 18:45:59,953] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/zero_to_fp32.py +[2022-09-21 18:45:59,953] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_mock_save -- 0.00 GB, 2.02 secs, 0.00 gb/s +********************************************* +[2022-09-21 18:46:00,921] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 
18:46:00,928] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:46:01,026] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... +Time to load fused_adam op: 0.001192331314086914 seconds +[2022-09-21 18:46:01,079] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:46:01,079] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:46:01,079] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:46:01,079] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:46:01,080] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:46:01,080] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0002560615539550781 seconds +[2022-09-21 18:46:01,130] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:46:01,131] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB +[2022-09-21 18:46:01,132] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.63 GB, percent = 1.3% +[2022-09-21 18:46:01,426] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:46:01,427] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,427] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,861] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:46:01,862] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,863] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,907] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:46:01,908] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,908] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,959] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:46:01,960] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,960] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,013] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:46:02,013] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB 
Max_CA 6 GB +[2022-09-21 18:46:02,014] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,065] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:46:02,066] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:02,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,125] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:46:02,126] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:02,126] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,126] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:46:02,172] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:46:02,173] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:02,173] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:46:02,174] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:46:02,174] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] elasticity_enabled ........... 
False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_groups .............. 
1 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:46:02,176] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "python", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0003757476806640625 seconds +[2022-09-21 18:46:02,181] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909547, 'write_secs': 0.7758586406707764} +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390323, 'write_secs': 4.455736398696899} +[2022-09-21 18:46:09,408] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/zero_to_fp32.py +[2022-09-21 18:46:09,409] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_py_save -- 10.13 GB, 7.23 secs, 1.40 gb/s +********************************************* +[2022-09-21 18:46:09,498] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:46:09,504] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:46:09,602] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.0010247230529785156 seconds +[2022-09-21 18:46:09,666] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:46:09,666] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:46:09,666] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:46:09,666] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:46:09,666] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:46:09,666] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0002410411834716797 seconds +[2022-09-21 18:46:09,746] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:46:09,747] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB +[2022-09-21 18:46:09,747] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.6 GB, percent = 1.3% +[2022-09-21 18:46:10,065] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:46:10,066] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:10,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.59 GB, percent = 1.3% +[2022-09-21 18:46:11,872] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:46:11,873] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,873] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% 
+[2022-09-21 18:46:11,918] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:46:11,919] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,919] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% +[2022-09-21 18:46:11,969] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:46:11,970] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,971] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,030] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:46:12,030] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:12,031] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,081] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:46:12,082] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:12,082] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,141] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:46:12,142] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:12,142] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,142] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:46:12,188] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:46:12,188] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:12,189] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:46:12,189] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... +Building extension module async_io... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module async_io... +Time to load async_io op: 0.5492517948150635 seconds +[2022-09-21 18:46:13,140] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] checkpoint_config ............ 
{'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_clipping ............ 
0.0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_type ................ 
0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_config .................. 
{ + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:46:13,143] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "fast", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.00046539306640625 seconds +[2022-09-21 18:46:13,149] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... 
+No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0002307891845703125 seconds +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0.4641237258911133, 'aio_write_secs': 0.17467093467712402, 'aio_bytes': 1585909248, 'aio_gbs': 8.455860654115417, 'slow_bytes': 297, 'slow_write_secs': 0.00024700164794921875, 'fill_buffer_count': 153, 'fill_buffer_secs': 0.3299696445465088, 'fill_buffer_speed': 4.476148362022062, 'save_storage': 0, 'save_storage_bytes': 0} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0003643035888671875 seconds +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 1.366792917251587, 'aio_write_secs': 0.8517467975616455, 'aio_bytes': 9288390144, 'aio_gbs': 10.156172524167351, 'slow_bytes': 177, 'slow_write_secs': 0.0003936290740966797, 'fill_buffer_count': 25, 'fill_buffer_secs': 0.5708425045013428, 'fill_buffer_speed': 15.153895084423882, 'save_storage': 0, 'save_storage_bytes': 0} +[2022-09-21 18:46:17,080] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/zero_to_fp32.py +[2022-09-21 18:46:17,080] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_fast_save -- 10.13 GB, 3.94 secs, 2.57 gb/s +********************************************* From e4817a1f8c9a8e1bda5618ce34465ce5adf9875c Mon Sep 17 00:00:00 2001 From: GuanhuaWang Date: Thu, 22 Sep 2022 01:23:54 +0000 Subject: [PATCH 19/40] add torch* error log with half flag but without fused flag --- 
.../log_9_21_22/torch_star_half_error.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt new file mode 100644 index 000000000..8d06a1011 --- /dev/null +++ b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt @@ -0,0 +1,13 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) +Model name = gpt2-large +[2022-09-22 01:22:52,520] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 +[2022-09-22 01:22:52,524] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... 
+[2022-09-22 01:22:53,396] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-22 01:22:53,397] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2022-09-22 01:22:53,400] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead +NCCL version 2.10.3+cuda11.3 +[2022-09-22 01:22:56,452] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2022-09-22 01:22:56,454] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-22 01:22:56,482] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} From b297e1776f8b9a266cc18852fd331e811b31f422 Mon Sep 17 00:00:00 2001 From: GuanhuaWang Date: Thu, 22 Sep 2022 01:31:01 +0000 Subject: [PATCH 20/40] log for error --- .../log_9_21_22/torch_star_half_error.txt | 75 +++++++++++++++++-- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt index 8d06a1011..5a5292f6e 100644 --- a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt +++ b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt @@ -2,12 +2,71 @@ Performance test of deepspeed integration of fast model checkpointing. 
torch version = 1.12.0+cu113 args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) Model name = gpt2-large -[2022-09-22 01:22:52,520] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 -[2022-09-22 01:22:52,524] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... -[2022-09-22 01:22:53,396] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-22 01:22:53,397] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl -[2022-09-22 01:22:53,400] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead +[2022-09-22 01:29:33,721] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 +[2022-09-22 01:29:33,725] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... +-------------------------------------------------------------------------- +WARNING: No preset parameters were found for the device that Open MPI +detected: + + Local host: azwuse57c00009D + Device name: mlx5_ib0 + Device vendor ID: 0x02c9 + Device vendor part ID: 4124 + +Default device parameters will be used, which may result in lower +performance. You can edit any of the files specified by the +btl_openib_device_param_files MCA parameter to set values for your +device. + +NOTE: You can turn off this warning by setting the MCA parameter + btl_openib_warn_no_device_params_found to 0. 
+-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +By default, for Open MPI 4.0 and later, infiniband ports on a device +are not used by default. The intent is to use UCX for these devices. +You can override this policy by setting the btl_openib_allow_ib MCA parameter +to true. + + Local host: azwuse57c00009D + Local adapter: mlx5_ib0 + Local port: 1 + +-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +WARNING: There was an error initializing an OpenFabrics device. + + Local host: azwuse57c00009D + Local device: mlx5_ib4 +-------------------------------------------------------------------------- +[2022-09-22 01:29:34,587] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-22 01:29:34,587] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2022-09-22 01:29:34,591] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead NCCL version 2.10.3+cuda11.3 -[2022-09-22 01:22:56,452] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False -[2022-09-22 01:22:56,454] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-22 01:22:56,482] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +[2022-09-22 01:29:38,429] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2022-09-22 01:29:38,430] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-22 01:29:38,461] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = 
{basic_optimizer.__class__.__name__} +Traceback (most recent call last): + File "deepspeed_save_model.py", line 133, in + main() + File "deepspeed_save_model.py", line 129, in main + run(model, model_name, ckpt_name, args) + File "deepspeed_save_model.py", line 106, in run + write_sec = test_save(tag, folder, model, args, writer_type) + File "deepspeed_save_model.py", line 76, in test_save + ds_engine = _get_ds_engine(model, ds_config) + File "deepspeed_save_model.py", line 52, in _get_ds_engine + ds_engine, _, _, _ = deepspeed.initialize( + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/__init__.py", line 124, in initialize + engine = DeepSpeedEngine(args=args, + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 322, in __init__ + self._configure_optimizer(optimizer, model_parameters) + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1178, in _configure_optimizer + self.optimizer = self._configure_fp16_optimizer(basic_optimizer) + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1314, in _configure_fp16_optimizer + or self.fp16_fused_mode() \ + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 792, in fp16_fused_mode + return self._config.fp16_fused_mode +AttributeError: 'DeepSpeedConfig' object has no attribute 'fp16_fused_mode' +[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / no device params found +[azwuse57c00009D:37114] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages +[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / ib port not selected From f05dab111f49e8f94f3163793e88fd197becb645 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 5 Oct 2022 17:58:55 +0000 Subject: [PATCH 21/40] local rank arg --- fast_io/model_checkpoint/deepspeed_save_model.py | 2 +- fast_io/model_checkpoint/save_model_utils.py | 5 +++++ 2 files 
changed, 6 insertions(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 081fe0299..cb0ec6009 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -102,7 +102,7 @@ def run(model, model_name, ckpt_name, args): for tag, writer_type in writer_dict.items(): folder = os.path.join(args.folder, ckpt_name, tag) if os.path.exists(folder): - shutil.rmtree(folder) + shutil.rmtree(folder, ignore_errors=True) write_sec = test_save(tag, folder, model, args, writer_type) ckpt_size = _get_folder_size(folder) gb_size = ckpt_size / (1024**3) diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 24f9f87d1..02ea1942b 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -72,6 +72,11 @@ def parse_arguments(): required=True, help='Hugging Face transformers tag of model (e.g., gpt2).') + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) + parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') From 1aa971aabf9d3a42354914576ab5fb7ee19437b4 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 5 Oct 2022 15:53:02 -0400 Subject: [PATCH 22/40] Handle local_rank arg (#78) * save_checkpoint perf monitoring * Disable checkpoint save on exit * local rank arg --- fast_io/model_checkpoint/deepspeed_save_model.py | 2 +- fast_io/model_checkpoint/save_model_utils.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 081fe0299..cb0ec6009 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -102,7 +102,7 @@ def run(model, model_name, ckpt_name, args): for tag, writer_type in 
writer_dict.items(): folder = os.path.join(args.folder, ckpt_name, tag) if os.path.exists(folder): - shutil.rmtree(folder) + shutil.rmtree(folder, ignore_errors=True) write_sec = test_save(tag, folder, model, args, writer_type) ckpt_size = _get_folder_size(folder) gb_size = ckpt_size / (1024**3) diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 24f9f87d1..02ea1942b 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -72,6 +72,11 @@ def parse_arguments(): required=True, help='Hugging Face transformers tag of model (e.g., gpt2).') + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) + parser.add_argument('--legacy', action='store_true', help='Use torch legacy save format') From 98b2f8a4b7c4eb4e760fabda27b12b8816c0d53f Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 5 Oct 2022 21:00:03 +0000 Subject: [PATCH 23/40] Single writer option --- fast_io/model_checkpoint/deepspeed_save_model.py | 1 + fast_io/model_checkpoint/save_model_utils.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index cb0ec6009..44e1e66f5 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -43,6 +43,7 @@ def _get_ds_config(args, writer_type): "io_buffer_size": args.io_buffer_mb * (1024**2), "io_buffer_double": not args.single_io_buffer, "show_statistics": not args.no_statistics, + "data_parallel": not args.single_writer } return ds_config diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 02ea1942b..af0b3d314 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -118,6 +118,9 @@ def parse_arguments(): action='store_true', help='Disable double 
buffering of i/o buffer.') + + parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') + args = parser.parse_args() print(f'args = {args}') return args From 2e4228518f660f7151aceafe5ade6557e91b4aed Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 5 Oct 2022 17:02:10 -0400 Subject: [PATCH 24/40] Single writer option (#79) * save_checkpoint perf monitoring * Disable checkpoint save on exit * local rank arg * Single writer option --- fast_io/model_checkpoint/deepspeed_save_model.py | 1 + fast_io/model_checkpoint/save_model_utils.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index cb0ec6009..44e1e66f5 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -43,6 +43,7 @@ def _get_ds_config(args, writer_type): "io_buffer_size": args.io_buffer_mb * (1024**2), "io_buffer_double": not args.single_io_buffer, "show_statistics": not args.no_statistics, + "data_parallel": not args.single_writer } return ds_config diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 02ea1942b..af0b3d314 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -118,6 +118,9 @@ def parse_arguments(): action='store_true', help='Disable double buffering of i/o buffer.') + + parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') + args = parser.parse_args() print(f'args = {args}') return args From a567adf6a99f0a7c976993a50420b31f218125e9 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 12 Oct 2022 11:43:06 +0000 Subject: [PATCH 25/40] Allow missing folder --- fast_io/model_checkpoint/deepspeed_save_model.py | 8 +++++--- 
fast_io/model_checkpoint/requirements.txt | 1 + fast_io/model_checkpoint/save_model_utils.py | 6 +++--- 3 files changed, 9 insertions(+), 6 deletions(-) create mode 100644 fast_io/model_checkpoint/requirements.txt diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 44e1e66f5..ffe0ff540 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -95,15 +95,17 @@ def _get_folder_size(folder): def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') writer_dict = { - 'test_save': None, - 'test_ds_mock_save': 'mock', - 'test_ds_py_save': 'python', + # 'test_save': None, + # 'test_ds_mock_save': 'mock', + # 'test_ds_py_save': 'python', 'test_ds_fast_save': 'fast' } for tag, writer_type in writer_dict.items(): folder = os.path.join(args.folder, ckpt_name, tag) if os.path.exists(folder): shutil.rmtree(folder, ignore_errors=True) + # if not os.path.exists(folder): + # os.makedirs(folder, exist_ok=True) write_sec = test_save(tag, folder, model, args, writer_type) ckpt_size = _get_folder_size(folder) gb_size = ckpt_size / (1024**3) diff --git a/fast_io/model_checkpoint/requirements.txt b/fast_io/model_checkpoint/requirements.txt new file mode 100644 index 000000000..976a2b1f3 --- /dev/null +++ b/fast_io/model_checkpoint/requirements.txt @@ -0,0 +1 @@ +transformers diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index af0b3d314..9ab2859f3 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -41,9 +41,9 @@ def get_model(model_tag): def validate_arguments(args): success = True - if not os.path.exists(args.folder): - print(f'Invalid folder: {args.folder}') - success = False + # if not os.path.exists(args.folder): + # print(f'Invalid folder: {args.folder}') + # success = False if not args.model in HF_MODELS: 
print(f'{args.model} is not a supported HF model tag') From 65793bd13bcacf2df5c68ada006007a40f59d082 Mon Sep 17 00:00:00 2001 From: Tunji Ruwase Date: Fri, 10 Feb 2023 01:25:29 +0000 Subject: [PATCH 26/40] DP writer refactor --- fast_io/model_checkpoint/deepspeed_save_model.py | 6 +++--- fast_io/model_checkpoint/save_model_utils.py | 2 +- fast_io/model_checkpoint/torch_save_tensor.py | 13 +++++++++---- fast_io/model_checkpoint/torch_save_utils.py | 12 ++++++++---- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index ffe0ff540..70148e6ca 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -32,8 +32,8 @@ def _get_ds_config(args, writer_type): "block_size": 8 * (1024**2), "queue_depth": 8, "single_submit": False, - "overlap_events": False, - "thread_count": 1, + "overlap_events": True, + "thread_count": 2, } } @@ -43,7 +43,7 @@ def _get_ds_config(args, writer_type): "io_buffer_size": args.io_buffer_mb * (1024**2), "io_buffer_double": not args.single_io_buffer, "show_statistics": not args.no_statistics, - "data_parallel": not args.single_writer + "data_parallel": "socket" # None # not args.single_writer } return ds_config diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 9ab2859f3..4101795f6 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -119,7 +119,7 @@ def parse_arguments(): help='Disable double buffering of i/o buffer.') - parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') + #parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') args = parser.parse_args() print(f'args = {args}') diff --git 
a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 80d5f1358..386d7156b 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -4,6 +4,7 @@ import os from torch_save_utils import PINNED_BUFFER_MB from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save +import deepspeed def run(args): @@ -14,9 +15,9 @@ def run(args): device=device) fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, - 'test_ds_py_save': test_ds_py_save, + # 'test_save': test_save, + # 'test_ds_mock_save': test_ds_mock_save, + # 'test_ds_py_save': test_ds_py_save, 'test_ds_fast_save': test_ds_fast_save } for tag, fn in fn_dict.items(): @@ -64,6 +65,10 @@ def parse_arguments(): parser.add_argument('--single_io_buffer', action='store_true', help='Disable double buffering of i/o buffer.') + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) args = parser.parse_args() print(f'args = {args}') @@ -78,7 +83,7 @@ def main(): if not os.path.exists(args.folder): print(f'Invalid folder: {args.folder}') quit() - + deepspeed.init_distributed() run(args) diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index c01fd014c..e274b5cda 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -3,7 +3,7 @@ import os import deepspeed from deepspeed.ops.aio import AsyncIOBuilder -from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter +from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter, FastFileWriterConfig AIO_QUEUE_DEPTH = 8 AIO_BLOCK_SIZE = 8 * (1024**2) @@ -62,10 +62,14 @@ def test_ds_fast_save(file, buffer, args): dtype=torch.uint8, device='cpu').pin_memory() st = time.time() + config = FastFileWriterConfig(aio_handle=h, + pinned_tensor=pinned_memory, + double_buffer=not 
args.single_io_buffer, + num_parallel_writers=1, + writer_rank=0) + ds_fast_writer = FastFileWriter(file_path=file, - aio_handle=h, - pinned_tensor=pinned_memory, - double_buffer=not args.single_io_buffer) + config=config) torch.save(f=ds_fast_writer, obj=buffer, _use_new_zipfile_serialization=not args.legacy) From 5bfdf04d65a66c2c42f094c9a680dc40925614ab Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 12 Feb 2025 11:59:23 -0500 Subject: [PATCH 27/40] Update for DS; Add GDS Signed-off-by: Olatunji Ruwase --- fast_io/model_checkpoint/save_model_utils.py | 54 ++++++++----------- fast_io/model_checkpoint/torch_save_model.py | 13 +++-- fast_io/model_checkpoint/torch_save_tensor.py | 13 +++-- fast_io/model_checkpoint/torch_save_utils.py | 45 +++++++++++++--- 4 files changed, 75 insertions(+), 50 deletions(-) diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 4101795f6..739e31124 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -5,47 +5,35 @@ from torch_save_utils import PINNED_BUFFER_MB -def _get_gpt_j_6B(tag): - model_name = "EleutherAI/gpt-j-6B" - model = AutoModelForCausalLM.from_pretrained(model_name) - ckpt_name = "gpt-j-6B" - return model, model_name, ckpt_name - - -def _get_tiny_t5(tag): - model_name = "hf-internal-testing/tiny-random-t5" - model = T5ForConditionalGeneration.from_pretrained(model_name) - ckpt_name = "tiny-random-t5" - return model, model_name, ckpt_name - - -def _get_hf_gpt2(tag): - model_name = tag - model = AutoModelForCausalLM.from_pretrained(tag) - ckpt_name = tag - return model, model_name, ckpt_name - - -HF_MODELS = { - 'tiny-t5': _get_tiny_t5, - 'gpt-j-6B': _get_gpt_j_6B, - 'gpt2': _get_hf_gpt2, - 'gpt2-large': _get_hf_gpt2, - 'gpt2-xl': _get_hf_gpt2, +TINY_T5 = 'tiny-t5' +PHI3_MINI = 'phi3' +PHI3_VISION = 'phi3-v' +LLAMA3_1B = 'llama3-1B' + +HF_MODELS_DICT = { + TINY_T5: "hf-internal-testing/tiny-random-t5", 
+ PHI3_MINI: "microsoft/Phi-3.5-mini-instruct", + PHI3_VISION: "microsoft/Phi-3.5-vision-instruct", + LLAMA3_1B: "meta-llama/Llama-3.2-1B", } +def _get_hf_model(tag): + model_name = HF_MODELS_DICT[tag] + if tag == TINY_T5: + model = T5ForConditionalGeneration.from_pretrained(model_name) + else: + model = AutoModelForCausalLM.from_pretrained(model_name) + + return model, model_name, tag def get_model(model_tag): - return HF_MODELS[model_tag](model_tag) + return _get_hf_model(model_tag) def validate_arguments(args): success = True - # if not os.path.exists(args.folder): - # print(f'Invalid folder: {args.folder}') - # success = False - if not args.model in HF_MODELS: + if not args.model in HF_MODELS_DICT: print(f'{args.model} is not a supported HF model tag') success = False @@ -70,7 +58,7 @@ def parse_arguments(): default=None, type=str, required=True, - help='Hugging Face transformers tag of model (e.g., gpt2).') + help=f'Hugging Face transformers tag of model. Available models = {list(HF_MODELS_DICT.keys())}') parser.add_argument('--local_rank', type=int, diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index 245d49e30..a489ee927 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -2,19 +2,22 @@ import torch from torch.optim import Adam import os -from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save from save_model_utils import get_model, validate_arguments, parse_arguments def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, - 'test_ds_py_save': test_ds_py_save, - 'test_ds_fast_save': test_ds_fast_save + # 'test_save': test_save, + # 'test_ds_mock_save': test_ds_mock_save, + # 
'test_ds_py_save': test_ds_py_save, + # 'test_ds_aio_fast_save': test_ds_aio_fast_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save } for tag, fn in fn_dict.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue file = os.path.join(args.folder, f'{tag}_{ckpt_name}.pt') print(f'checkpoint file = {file}') if os.path.isfile(file): diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index 386d7156b..accbcd8b2 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -3,7 +3,7 @@ import torch import os from torch_save_utils import PINNED_BUFFER_MB -from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_fast_save +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save import deepspeed @@ -15,12 +15,15 @@ def run(args): device=device) fn_dict = { - # 'test_save': test_save, - # 'test_ds_mock_save': test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - 'test_ds_fast_save': test_ds_fast_save + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save } for tag, fn in fn_dict.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') print(f'checkpoint file = {file}') if os.path.isfile(file): diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index e274b5cda..166ce4582 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -2,12 +2,13 @@ import torch import os import deepspeed -from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.ops.op_builder import AsyncIOBuilder, GDSBuilder from deepspeed.io import 
MockFileWriter, PyFileWriter, FastFileWriter, FastFileWriterConfig +from deepspeed.accelerator import get_accelerator AIO_QUEUE_DEPTH = 8 AIO_BLOCK_SIZE = 8 * (1024**2) -AIO_THREAD_COUNT = 1 +AIO_INTRA_OP_PARALLEL = 1 AIO_SINGLE_SUBMIT = False AIO_OVERLAP_EVENTS = False PINNED_BUFFER_MB = 64 @@ -18,9 +19,16 @@ def _get_aio_handle(): queue_depth=AIO_QUEUE_DEPTH, single_submit=AIO_SINGLE_SUBMIT, overlap_events=AIO_SINGLE_SUBMIT, - num_threads=AIO_THREAD_COUNT) + intra_op_parallelism=AIO_INTRA_OP_PARALLEL) return h +def _get_gds_handle(): + h = GDSBuilder().load().gds_handle(block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + intra_op_parallelism=AIO_INTRA_OP_PARALLEL) + return h def test_save(file, buffer, args): st = time.time() @@ -55,21 +63,37 @@ def test_ds_py_save(file, buffer, args): ds_py_writer._dump_state() return write_sec - -def test_ds_fast_save(file, buffer, args): +def _get_aio_components(args): h = _get_aio_handle() pinned_memory = torch.zeros(args.io_buffer_mb * (1024**2), dtype=torch.uint8, device='cpu').pin_memory() + return h, pinned_memory + +def _get_gds_components(args): + h = _get_gds_handle() + pinned_memory = torch.empty(args.io_buffer_mb * (1024**2), + dtype=torch.uint8, + device=get_accelerator().device_name()) + h.pin_device_tensor(pinned_memory) + return h, pinned_memory + + + +def _test_ds_fast_save(file, buffer, args, use_gds): + if use_gds: + h, pinned_memory = _get_gds_components(args) + else: + h, pinned_memory = _get_aio_components(args) st = time.time() - config = FastFileWriterConfig(aio_handle=h, + fast_writer_config = FastFileWriterConfig(aio_handle=h, pinned_tensor=pinned_memory, double_buffer=not args.single_io_buffer, num_parallel_writers=1, writer_rank=0) ds_fast_writer = FastFileWriter(file_path=file, - config=config) + config=fast_writer_config) torch.save(f=ds_fast_writer, obj=buffer, _use_new_zipfile_serialization=not args.legacy) @@ -78,3 
+102,10 @@ def test_ds_fast_save(file, buffer, args): if not args.no_statistics: ds_fast_writer._dump_state() return write_sec + + +def test_ds_aio_fast_save(file, buffer, args): + return _test_ds_fast_save(file, buffer, args, False) + +def test_ds_gds_fast_save(file, buffer, args): + return _test_ds_fast_save(file, buffer, args, True) From 9a27914d5660b46a01765c8e50f74964b80fdbde Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 20 Feb 2025 07:17:04 -0500 Subject: [PATCH 28/40] Integrate GDS into deepspeed_model_save --- .../model_checkpoint/deepspeed_save_model.py | 23 +++++++++++-------- fast_io/model_checkpoint/save_model_utils.py | 4 +++- fast_io/model_checkpoint/torch_save_model.py | 17 +++++++------- fast_io/model_checkpoint/torch_save_tensor.py | 11 +++++---- fast_io/model_checkpoint/torch_save_utils.py | 8 +++---- 5 files changed, 35 insertions(+), 28 deletions(-) diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py index 70148e6ca..ea97dd717 100644 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ b/fast_io/model_checkpoint/deepspeed_save_model.py @@ -6,10 +6,10 @@ import random import numpy as np import deepspeed +from deepspeed.accelerator import get_accelerator from save_model_utils import get_model, validate_arguments, parse_arguments - -def _get_ds_config(args, writer_type): +def _get_ds_config(args, writer_type, use_gds): ds_config = { "train_micro_batch_size_per_gpu": 1, "zero_optimization": { @@ -33,7 +33,8 @@ def _get_ds_config(args, writer_type): "queue_depth": 8, "single_submit": False, "overlap_events": True, - "thread_count": 2, + "intra_op_parallelism": 2, + "use_gds": use_gds, } } @@ -69,11 +70,12 @@ def _free_ds_memory(ds_engine): ds_engine = None del ds_engine gc.collect() - torch.cuda.empty_cache() + get_accelerator().empty_cache() def test_save(tag, folder, model, args, writer_type): - ds_config = _get_ds_config(args, writer_type) + use_gds = writer_type == 
'fast' and 'gds' in tag + ds_config = _get_ds_config(args, writer_type, use_gds) ds_engine = _get_ds_engine(model, ds_config) if args.zero_stage == 0: _do_optimizer_step(ds_engine) @@ -95,10 +97,11 @@ def _get_folder_size(folder): def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') writer_dict = { - # 'test_save': None, - # 'test_ds_mock_save': 'mock', - # 'test_ds_py_save': 'python', - 'test_ds_fast_save': 'fast' + 'test_save': None, + 'test_ds_mock_save': 'mock', + 'test_ds_py_save': 'python', + 'test_ds_aio_fast_save': 'fast', + 'test_ds_gds_fast_save': 'fast', } for tag, writer_type in writer_dict.items(): folder = os.path.join(args.folder, ckpt_name, tag) @@ -111,7 +114,7 @@ def run(model, model_name, ckpt_name, args): gb_size = ckpt_size / (1024**3) gb_per_sec = gb_size / write_sec print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' ) print(f'*********************************************') diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py index 739e31124..faf4fc5d8 100644 --- a/fast_io/model_checkpoint/save_model_utils.py +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -5,6 +5,7 @@ from torch_save_utils import PINNED_BUFFER_MB +GPT2L = 'gpt2-large' TINY_T5 = 'tiny-t5' PHI3_MINI = 'phi3' PHI3_VISION = 'phi3-v' @@ -12,6 +13,7 @@ HF_MODELS_DICT = { TINY_T5: "hf-internal-testing/tiny-random-t5", + GPT2L: GPT2L, PHI3_MINI: "microsoft/Phi-3.5-mini-instruct", PHI3_VISION: "microsoft/Phi-3.5-vision-instruct", LLAMA3_1B: "meta-llama/Llama-3.2-1B", @@ -58,7 +60,7 @@ def parse_arguments(): default=None, type=str, required=True, - help=f'Hugging Face transformers tag of model. Available models = {list(HF_MODELS_DICT.keys())}') + help=f'HuggingFace tag of model. 
Available models = {list(HF_MODELS_DICT.keys())}') parser.add_argument('--local_rank', type=int, diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py index a489ee927..6c1103049 100644 --- a/fast_io/model_checkpoint/torch_save_model.py +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -4,16 +4,18 @@ import os from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save from save_model_utils import get_model, validate_arguments, parse_arguments +import deepspeed +from deepspeed.accelerator import get_accelerator def run(model, model_name, ckpt_name, args): print(f'Model name = {model_name}') fn_dict = { - # 'test_save': test_save, - # 'test_ds_mock_save': test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - # 'test_ds_aio_fast_save': test_ds_aio_fast_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, } for tag, fn in fn_dict.items(): if tag == 'test_ds_gds_fast_save' and not args.gpu: @@ -28,14 +30,13 @@ def run(model, model_name, ckpt_name, args): gb_size = ckpt_size / (1024**3) gb_per_sec = gb_size / write_sec print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' ) print(f'*********************************************') def _get_initialized_optimizer(model, fused_opt): base_optimizer = Adam(model.parameters()) - import deepspeed if fused_opt: from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper else: @@ -62,7 +63,7 @@ def main(): if args.half: model = model.half() if args.gpu: - model = model.cuda() + model = model.to(get_accelerator().current_device_name()) if args.optimizer: optimizer = 
_get_initialized_optimizer(model, args.fused) ckpt_state = {'model': model, 'optimizer': optimizer} diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py index accbcd8b2..014fdd035 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -5,19 +5,20 @@ from torch_save_utils import PINNED_BUFFER_MB from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save import deepspeed +from deepspeed.accelerator import get_accelerator def run(args): - device = torch.cuda.current_device() if args.gpu else 'cpu' + device = get_accelerator().current_device_name() if args.gpu else 'cpu' buffer = torch.randint(high=128, size=(args.mb_size * (1024**2), ), dtype=torch.uint8, device=device) fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, - 'test_ds_py_save': test_ds_py_save, + # 'test_save': test_save, + # 'test_ds_mock_save': test_ds_mock_save, + # 'test_ds_py_save': test_ds_py_save, 'test_ds_aio_fast_save': test_ds_aio_fast_save, 'test_ds_gds_fast_save': test_ds_gds_fast_save } @@ -33,7 +34,7 @@ def run(args): gb_per_sec = args.mb_size / (1024.0 * write_sec) gb_size = os.path.getsize(file) / (1024**3) print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} gb/s' + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' ) print(f'*********************************************') diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py index 166ce4582..cf5f2bba5 100644 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -15,7 +15,7 @@ def _get_aio_handle(): - h = AsyncIOBuilder().load().aio_handle(block_size=AIO_BLOCK_SIZE, + h = AsyncIOBuilder().load(verbose=False).aio_handle(block_size=AIO_BLOCK_SIZE, queue_depth=AIO_QUEUE_DEPTH, 
single_submit=AIO_SINGLE_SUBMIT, overlap_events=AIO_SINGLE_SUBMIT, @@ -23,7 +23,7 @@ def _get_aio_handle(): return h def _get_gds_handle(): - h = GDSBuilder().load().gds_handle(block_size=AIO_BLOCK_SIZE, + h = GDSBuilder().load(verbose=False).gds_handle(block_size=AIO_BLOCK_SIZE, queue_depth=AIO_QUEUE_DEPTH, single_submit=AIO_SINGLE_SUBMIT, overlap_events=AIO_SINGLE_SUBMIT, @@ -74,7 +74,7 @@ def _get_gds_components(args): h = _get_gds_handle() pinned_memory = torch.empty(args.io_buffer_mb * (1024**2), dtype=torch.uint8, - device=get_accelerator().device_name()) + device=get_accelerator().current_device_name()) h.pin_device_tensor(pinned_memory) return h, pinned_memory @@ -86,7 +86,7 @@ def _test_ds_fast_save(file, buffer, args, use_gds): else: h, pinned_memory = _get_aio_components(args) st = time.time() - fast_writer_config = FastFileWriterConfig(aio_handle=h, + fast_writer_config = FastFileWriterConfig(dnvme_handle=h, pinned_tensor=pinned_memory, double_buffer=not args.single_io_buffer, num_parallel_writers=1, From 515dded20cac9147407e9a368b1c179be1598c93 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 25 Feb 2025 13:07:42 -0500 Subject: [PATCH 29/40] Rebase fast persist (#184) * Fast model checkpointing * Support both legacy and serialized formats * Add io_buffer_mb option * Bug fix * Force flush * More model options; Refactor common codes * --gpu option * --half and more flexible options * Add deepspeed.save_checkpoint() * Free ds memory * Improve repro * Double I/O buffer (#56) * Double I/O buffer (#60) * Add checkpoint comparison (#62) * Add checkpoint comparison * Corrected a typo Co-authored-by: Yang Li * save_checkpoint perf monitoring * Disable checkpoint save on exit * Perf statistics for save_checkpoint (#64) * save_checkpoint perf monitoring * Disable checkpoint save on exit * add logs for a100-80 * add torch* error log with half flag but without fused flag * log for error * local rank arg * Handle local_rank arg (#78) * save_checkpoint perf 
monitoring * Disable checkpoint save on exit * local rank arg * Single writer option * Single writer option (#79) * save_checkpoint perf monitoring * Disable checkpoint save on exit * local rank arg * Single writer option * Allow missing folder * DP writer refactor * Update for DS; Add GDS Signed-off-by: Olatunji Ruwase * Integrate GDS into deepspeed_model_save --------- Signed-off-by: Olatunji Ruwase Co-authored-by: jerryyangli Co-authored-by: Yang Li Co-authored-by: GuanhuaWang --- .../model_checkpoint/checkpoint_compare.py | 123 +++ .../model_checkpoint/deepspeed_save_model.py | 139 ++++ .../log_9_21_22/gpt2-unfused.txt | 599 ++++++++++++++ .../log_9_21_22/gpt2_fused_z2.txt | 781 ++++++++++++++++++ .../log_9_21_22/torch_star_half_error.txt | 72 ++ fast_io/model_checkpoint/requirements.txt | 1 + fast_io/model_checkpoint/save_model_utils.py | 116 +++ fast_io/model_checkpoint/torch_save_model.py | 76 ++ fast_io/model_checkpoint/torch_save_tensor.py | 95 +++ fast_io/model_checkpoint/torch_save_utils.py | 111 +++ 10 files changed, 2113 insertions(+) create mode 100644 fast_io/model_checkpoint/checkpoint_compare.py create mode 100644 fast_io/model_checkpoint/deepspeed_save_model.py create mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt create mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt create mode 100644 fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt create mode 100644 fast_io/model_checkpoint/requirements.txt create mode 100644 fast_io/model_checkpoint/save_model_utils.py create mode 100644 fast_io/model_checkpoint/torch_save_model.py create mode 100644 fast_io/model_checkpoint/torch_save_tensor.py create mode 100644 fast_io/model_checkpoint/torch_save_utils.py diff --git a/fast_io/model_checkpoint/checkpoint_compare.py b/fast_io/model_checkpoint/checkpoint_compare.py new file mode 100644 index 000000000..cc67b61d9 --- /dev/null +++ b/fast_io/model_checkpoint/checkpoint_compare.py @@ -0,0 +1,123 @@ +#This 
#This script is for testing whether two checkpoints match; it prints all the differences

import torch
import os
import sys
import pickle
from collections import OrderedDict

# Key paths (components joined with '/') whose mismatches are expected by
# design (e.g. the checkpoint writer config) and are therefore not reported.
exclude_key_str = {'ds_config/checkpoint/writer'}

def main():
    """Compare every checkpoint file common to the two directories given as
    argv[1] and argv[2], printing all differences found."""
    dir1 = sys.argv[1]
    dir2 = sys.argv[2]
    print ("Begin comparison")
    print ("The first directory {}" .format(dir1))
    print ("The second directory {}" .format(dir2))
    print (' ')

    file_list1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))]
    file_list2 = [f for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))]
    common_files = []

    # Report files present in only one directory; collect the intersection.
    for f in file_list1:
        if not (f in file_list2):
            log_error_file_mismatch_first(f)
        else:
            common_files.append(f)
    for f in file_list2:
        if not (f in file_list1):
            log_error_file_mismatch_second(f)

    for f in common_files:
        full_dir1 = os.path.join(dir1, f)
        full_dir2 = os.path.join(dir2, f)
        print ("Begin comparison")
        print("The first checkpoint {}" .format(full_dir1))
        print("The second checkpoint {}" .format(full_dir2))
        print(' ')
        model_first = torch.load(full_dir1)
        model_second = torch.load(full_dir2)
        object_compare(model_first, model_second, [])


def object_compare(model_first, model_second, key_chain):
    """Recursively compare two loaded checkpoint objects, printing every
    difference.

    key_chain is the list of dict keys leading to the current pair; it is
    used only for reporting and is restored (via pop) before returning.
    """
    # Differing types are reported as a value mismatch without recursing.
    if not (type(model_first) == type(model_second)):
        log_error_value_mismatch(model_first, model_second, key_chain)
        return

    if type(model_first) is list:
        if len(model_first) != len(model_second):
            log_error_value_mismatch(model_first, model_second, key_chain)
            return
        for i in range(len(model_first)):
            object_compare(model_first[i], model_second[i], key_chain)
        return

    if type(model_first) is dict or type(model_first) is OrderedDict:
        common_keys = []
        for key in model_first:
            if key not in model_second:
                key_chain.append(key)
                log_error_key_mismatch_first(model_first[key], key_chain)
                key_chain.pop()
            else:
                common_keys.append(key)

        for key in model_second:
            if key not in model_first:
                key_chain.append(key)
                log_error_key_mismatch_second(model_second[key], key_chain)
                key_chain.pop()

        for key in common_keys:
            key_chain.append(key)
            object_compare(model_first[key], model_second[key], key_chain)
            key_chain.pop()
        return

    # Leaf comparison: prefer attribute-dict equality when available,
    # otherwise the values themselves. Tensor/ndarray comparison yields an
    # elementwise result, so collapse non-bool results with .all().
    if hasattr(model_first, '__dict__'):
        equality = (model_first.__dict__ == model_second.__dict__)
    else:
        equality = (model_first == model_second)
    if type(equality) is not bool:
        equality = (equality.all())
    if not equality:
        log_error_value_mismatch(model_first, model_second, key_chain)
    return


def log_error_file_mismatch_first(filename):
    """Report a file present only in the first directory."""
    print("The following file appeared in the first but not the second directory: {}" .format(filename))
    print(' ')


def log_error_file_mismatch_second(filename):
    """Report a file present only in the second directory."""
    # Bug fix: this reports a missing *file*, not a missing key — the
    # original message said "key", inconsistent with its *_first sibling.
    print("The following file appeared in the second but not the first directory: {}" .format(filename))
    print(" ")


def log_error_key_mismatch_first(model, key_chain):
    """Report a key present only in the first model, unless excluded."""
    key_str = "/".join(key_chain)
    if not (key_str in exclude_key_str):
        print("The following key appeared in the first but not the second model: {}" .format(key_str))
        print("The value of the first model is: {}" .format(model))
        print(" ")


def log_error_key_mismatch_second(model, key_chain):
    """Report a key present only in the second model, unless excluded."""
    key_str = "/".join(key_chain)
    if not (key_str in exclude_key_str):
        print("The following key appeared in the second but not the first model: {}" .format(key_str))
        print("The value of the second model is: {}" .format(model))
        print(" ")


def log_error_value_mismatch(model_first, model_second, key_chain):
    """Report differing values at the position described by key_chain."""
    print ("The values of the following key do not match: {}" .format("/".join(key_chain)))
    print ("The value of the first model is: {}" .format(model_first))
    print ("The value of the second model is: {}" .format(model_second))
    print(" ")

if __name__ == "__main__":
    main()
import time
import torch
import os
import shutil
import gc
import random
import numpy as np
import deepspeed
from deepspeed.accelerator import get_accelerator
from save_model_utils import get_model, validate_arguments, parse_arguments

def _get_ds_config(args, writer_type, use_gds):
    """Build the DeepSpeed config dict for one checkpoint-save benchmark run.

    writer_type: None for baseline torch serialization, or one of the
    deepspeed.io writer types ('mock', 'python', 'fast'). use_gds toggles
    GPUDirect Storage in the aio section (meaningful for the 'fast' writer).
    """
    ds_config = {
        "train_micro_batch_size_per_gpu": 1,
        "zero_optimization": {
            "stage": args.zero_stage,
            "cpu_offload": args.cpu_offload
        },
        "fp16": {
            "enabled": args.half
        },
        "optimizer": {
            "type": "Adam",
            "params": {
                "torch_adam": not args.fused
            }
        },
        "checkpoint": {
            # legacy mode disables the fast checkpoint serialization path
            "checkpoint_serialization": not args.legacy
        },
        "aio": {
            "block_size": 8 * (1024**2),
            "queue_depth": 8,
            "single_submit": False,
            "overlap_events": True,
            "intra_op_parallelism": 2,
            "use_gds": use_gds,
        }
    }

    if writer_type:
        ds_config["checkpoint"]["writer"] = {
            "type": writer_type,
            "io_buffer_size": args.io_buffer_mb * (1024**2),
            "io_buffer_double": not args.single_io_buffer,
            "show_statistics": not args.no_statistics,
            # NOTE(review): hard-coded to socket-level data parallelism;
            # alternatives previously considered were None and
            # `not args.single_writer` — confirm before generalizing.
            "data_parallel": "socket"
        }

    return ds_config


def _get_ds_engine(model, ds_config):
    """Wrap the model in a DeepSpeed engine built from ds_config."""
    ds_engine, _, _, _ = deepspeed.initialize(
        model=model, model_parameters=model.parameters(), config=ds_config)

    return ds_engine


def _do_optimizer_step(ds_engine):
    """Take one optimizer step with zero gradients so optimizer state
    exists (and is included) in the checkpoint."""
    for p in ds_engine.module.parameters():
        p.grad = torch.zeros_like(p)
    ds_engine.step()


def _free_ds_memory(ds_engine):
    """Drop all references held by the engine and reclaim host/device
    memory between benchmark runs."""
    ds_engine.optimizer.optimizer = None
    ds_engine.optimizer = None
    ds_engine.module = None
    ds_engine = None
    del ds_engine
    gc.collect()
    get_accelerator().empty_cache()


def test_save(tag, folder, model, args, writer_type):
    """Save a checkpoint of model into folder using the given writer and
    return the wall-clock save time in seconds."""
    # GDS is only exercised by the 'fast' writer when the tag requests it.
    use_gds = writer_type == 'fast' and 'gds' in tag
    ds_config = _get_ds_config(args, writer_type, use_gds)
    ds_engine = _get_ds_engine(model, ds_config)
    if args.zero_stage == 0:
        _do_optimizer_step(ds_engine)

    st = time.time()
    ds_engine.save_checkpoint(save_dir=folder, tag=tag)
    write_sec = time.time() - st
    _free_ds_memory(ds_engine)
    return write_sec


def _get_folder_size(folder):
    """Return the total size in bytes of all files under folder."""
    size = 0
    for path, _, files in os.walk(folder):
        size += sum(os.path.getsize(os.path.join(path, f)) for f in files)
    return size


def run(model, model_name, ckpt_name, args):
    """Benchmark each checkpoint writer on model, printing size and
    throughput for every run."""
    print(f'Model name = {model_name}')
    writer_dict = {
        'test_save': None,
        'test_ds_mock_save': 'mock',
        'test_ds_py_save': 'python',
        'test_ds_aio_fast_save': 'fast',
        'test_ds_gds_fast_save': 'fast',
    }
    for tag, writer_type in writer_dict.items():
        folder = os.path.join(args.folder, ckpt_name, tag)
        # Start from a clean slate so sizes reflect only this run.
        if os.path.exists(folder):
            shutil.rmtree(folder, ignore_errors=True)
        write_sec = test_save(tag, folder, model, args, writer_type)
        ckpt_size = _get_folder_size(folder)
        gb_size = ckpt_size / (1024**3)
        gb_per_sec = gb_size / write_sec
        print(
            f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s'
        )
        print(f'*********************************************')


def main():
    """Entry point: seed RNGs for reproducibility, parse and validate
    arguments, then run the writer benchmarks."""
    print(
        f'Performance test of deepspeed integration of fast model checkpointing.'
    )
    print(f'torch version = {torch.__version__}')
    torch.manual_seed(42)
    np.random.seed(0)
    random.seed(0)
    args = parse_arguments()
    if not validate_arguments(args):
        # Replaces quit(): the site-injected quit() is meant for the REPL;
        # returning ends the script identically here.
        return

    model, model_name, ckpt_name = get_model(args.model)
    run(model, model_name, ckpt_name, args)


if __name__ == "__main__":
    main()
+torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) +Model name = gpt2-large +[2022-09-21 18:42:17,245] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:17,246] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... +[2022-09-21 18:42:18,108] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-21 18:42:18,109] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl +[2022-09-21 18:42:21,535] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +NCCL version 2.10.3+cuda11.3 +[2022-09-21 18:42:21,770] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:21,772] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:21,772] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:21,772] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:22,127] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 
18:42:22,127] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_tol ............... 
0.01 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] prescale_gradients ........... 
False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] use_quantizer_kernel ......... 
False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:22,130] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module utils... +Time to load utils op: 0.3399326801300049 seconds +[2022-09-21 18:42:23,204] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt +test_save -- 10.13 GB, 6.83 secs, 1.48 gb/s +********************************************* +[2022-09-21 18:42:30,157] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:30,164] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:30,277] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:30,278] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:30,278] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:30,278] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:30,656] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:30,656] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": 
false +} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_offset .............. 
1000 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:30,659] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "mock", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0004949569702148438 seconds +[2022-09-21 18:42:30,786] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +test_ds_mock_save -- 0.00 GB, 0.93 secs, 0.00 gb/s +********************************************* +[2022-09-21 18:42:32,824] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:32,831] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:32,926] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:32,927] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:32,927] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:32,927] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:33,248] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:42:33,248] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] elasticity_enabled ........... 
False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_groups .............. 
1 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 +[2022-09-21 18:42:33,250] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "python", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.000392913818359375 seconds +[2022-09-21 18:42:33,377] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523621, 'write_secs': 5.274229288101196} +test_ds_py_save -- 10.13 GB, 6.32 secs, 1.60 gb/s +********************************************* +[2022-09-21 18:42:39,940] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:42:39,946] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:42:40,048] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +[2022-09-21 18:42:40,049] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:42:40,049] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW +[2022-09-21 18:42:40,049] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale +[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:42:40,439] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:42:40,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... +Building extension module async_io... +Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module async_io... +Time to load async_io op: 0.4869067668914795 seconds +[2022-09-21 18:42:41,329] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 
0 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_job_name ......... 
DeepSpeedJobName +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_config .................. { + "stage": 0, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_enabled ................. False +[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_optimization_stage ...... 
0 +[2022-09-21 18:42:41,332] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 0, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "fast", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0004849433898925781 seconds +[2022-09-21 18:42:41,458] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0003745555877685547 seconds +stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 1.8456230163574219, 'aio_write_secs': 0.9408478736877441, 'aio_bytes': 10874523136, 'aio_gbs': 10.76442766994695, 'slow_bytes': 483, 'slow_write_secs': 0.0002315044403076172, 'fill_buffer_count': 4519, 'fill_buffer_secs': 0.9024286270141602, 'fill_buffer_speed': 11.22270347101499, 'save_storage': 0, 'save_storage_bytes': 0} +test_ds_fast_save -- 10.13 GB, 3.00 secs, 3.38 gb/s +********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt new file mode 100644 index 000000000..9871b634e --- /dev/null +++ b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt @@ -0,0 +1,781 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=True, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=2) +Model name = gpt2-large +[2022-09-21 18:45:23,129] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:45:23,130] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
+[2022-09-21 18:45:23,991] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-21 18:45:23,991] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl +[2022-09-21 18:45:27,189] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +NCCL version 2.10.3+cuda11.3 +[2022-09-21 18:45:27,478] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Creating extension directory /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam... +Detected CUDA files, patching ldflags +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam/build.ninja... +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options 
'-fPIC' -lineinfo -O3 --use_fast_math -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o +[2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o +[3/3] c++ fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_adam.so +Loading extension module fused_adam... 
+Time to load fused_adam op: 19.252447843551636 seconds +[2022-09-21 18:45:47,263] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:45:47,263] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:45:47,263] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:45:47,263] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:45:47,263] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:45:47,263] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module utils... 
+Time to load utils op: 0.3341379165649414 seconds +[2022-09-21 18:45:47,651] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:45:47,652] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB +[2022-09-21 18:45:47,652] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.58 GB, percent = 1.3% +[2022-09-21 18:45:47,945] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:45:47,946] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB +[2022-09-21 18:45:47,946] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% +[2022-09-21 18:45:48,634] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:45:48,635] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,635] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% +[2022-09-21 18:45:48,681] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:45:48,682] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,682] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.53 GB, percent = 1.3% +[2022-09-21 18:45:48,733] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:45:48,734] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB +[2022-09-21 18:45:48,734] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.4 GB, percent = 1.3% +[2022-09-21 18:45:48,796] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:45:48,797] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 7.38 GB 
Max_CA 7 GB +[2022-09-21 18:45:48,797] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,848] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:45:48,849] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 7.38 GB Max_CA 7 GB +[2022-09-21 18:45:48,849] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,920] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:45:48,921] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 16.04 GB Max_CA 16 GB +[2022-09-21 18:45:48,921] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,921] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:45:48,968] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:45:48,969] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 16.04 GB Max_CA 16 GB +[2022-09-21 18:45:48,969] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:45:48,969] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:45:48,969] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_offset .............. 
1000 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:45:48,972] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0004029273986816406 seconds +[2022-09-21 18:45:49,143] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt +[2022-09-21 18:45:56,478] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_save/zero_to_fp32.py +[2022-09-21 18:45:56,479] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_save -- 10.13 GB, 7.51 secs, 1.35 gb/s +********************************************* +[2022-09-21 18:45:56,603] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:45:56,610] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:45:56,709] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.0011363029479980469 seconds +[2022-09-21 18:45:56,771] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:45:56,771] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:45:56,771] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:45:56,771] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:45:56,771] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:45:56,771] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.00023317337036132812 seconds +[2022-09-21 18:45:56,823] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:45:56,824] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 16 GB +[2022-09-21 18:45:56,824] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.55 GB, percent = 1.3% +[2022-09-21 18:45:57,123] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:45:57,124] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,124] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.54 GB, percent = 1.3% +[2022-09-21 18:45:57,614] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:45:57,615] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,616] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.51 GB, percent = 1.3% 
+[2022-09-21 18:45:57,661] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:45:57,662] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,662] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% +[2022-09-21 18:45:57,713] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:45:57,714] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:45:57,714] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,775] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:45:57,775] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:45:57,776] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% +[2022-09-21 18:45:57,827] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:45:57,828] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:45:57,828] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,887] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:45:57,887] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:45:57,888] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.38 GB, percent = 1.3% +[2022-09-21 18:45:57,888] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:45:57,933] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:45:57,934] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:45:57,934] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% +[2022-09-21 18:45:57,934] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:45:57,935] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:45:57,935] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... 
None +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... 
False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_gradients_enabled ..... 
False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_optimization_stage ...... 
2 +[2022-09-21 18:45:57,937] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "mock", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.000377655029296875 seconds +[2022-09-21 18:45:57,942] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} +[2022-09-21 18:45:59,953] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/zero_to_fp32.py +[2022-09-21 18:45:59,953] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_mock_save -- 0.00 GB, 2.02 secs, 0.00 gb/s +********************************************* +[2022-09-21 18:46:00,921] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 
18:46:00,928] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:46:01,026] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... +Time to load fused_adam op: 0.001192331314086914 seconds +[2022-09-21 18:46:01,079] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:46:01,079] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:46:01,079] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:46:01,079] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:46:01,080] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:46:01,080] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0002560615539550781 seconds +[2022-09-21 18:46:01,130] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:46:01,131] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB +[2022-09-21 18:46:01,132] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.63 GB, percent = 1.3% +[2022-09-21 18:46:01,426] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:46:01,427] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,427] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,861] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:46:01,862] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,863] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,907] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:46:01,908] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,908] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% +[2022-09-21 18:46:01,959] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:46:01,960] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:01,960] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,013] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:46:02,013] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB 
Max_CA 6 GB +[2022-09-21 18:46:02,014] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,065] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:46:02,066] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:02,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,125] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:46:02,126] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:02,126] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,126] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:46:02,172] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:46:02,173] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:02,173] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:46:02,174] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +[2022-09-21 18:46:02,174] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + 
"synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] elasticity_enabled ........... 
False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_groups .............. 
1 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_type ................ 0 +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] world_size ................... 
1 +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_config .................. { + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:46:02,176] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "python", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... 
+Time to load utils op: 0.0003757476806640625 seconds +[2022-09-21 18:46:02,181] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909547, 'write_secs': 0.7758586406707764} +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390323, 'write_secs': 4.455736398696899} +[2022-09-21 18:46:09,408] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/zero_to_fp32.py +[2022-09-21 18:46:09,409] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_py_save -- 10.13 GB, 7.23 secs, 1.40 gb/s +********************************************* +[2022-09-21 18:46:09,498] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 +[2022-09-21 18:46:09,504] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 +[2022-09-21 18:46:09,602] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module fused_adam, skipping build step... +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.0010247230529785156 seconds +[2022-09-21 18:46:09,666] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-21 18:46:09,666] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam +Checking ZeRO support for optimizer=FusedAdam type= +[2022-09-21 18:46:09,666] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer +[2022-09-21 18:46:09,666] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 +[2022-09-21 18:46:09,666] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 +[2022-09-21 18:46:09,666] [INFO] [stage2.py:107:__init__] CPU Offload: False +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0002410411834716797 seconds +[2022-09-21 18:46:09,746] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU +[2022-09-21 18:46:09,747] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB +[2022-09-21 18:46:09,747] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.6 GB, percent = 1.3% +[2022-09-21 18:46:10,065] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU +[2022-09-21 18:46:10,066] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:10,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.59 GB, percent = 1.3% +[2022-09-21 18:46:11,872] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU +[2022-09-21 18:46:11,873] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,873] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% 
+[2022-09-21 18:46:11,918] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache +[2022-09-21 18:46:11,919] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,919] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% +[2022-09-21 18:46:11,969] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 +[2022-09-21 18:46:11,970] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB +[2022-09-21 18:46:11,971] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,030] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 +[2022-09-21 18:46:12,030] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:12,031] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,081] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states +[2022-09-21 18:46:12,082] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB +[2022-09-21 18:46:12,082] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,141] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states +[2022-09-21 18:46:12,142] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:12,142] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,142] [INFO] [stage2.py:415:__init__] optimizer state initialized +[2022-09-21 18:46:12,188] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer +[2022-09-21 18:46:12,188] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB +[2022-09-21 18:46:12,189] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam +[2022-09-21 18:46:12,189] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None +[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... +Building extension module async_io... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module async_io... +Time to load async_io op: 0.5492517948150635 seconds +[2022-09-21 18:46:13,140] [INFO] [config.py:882:print] DeepSpeedEngine configuration: +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_enabled .................. False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_params ................... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] checkpoint_config ............ 
{'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] disable_allgather ............ False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dump_state ................... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_enabled ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_verbose ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] elasticity_enabled ........... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_enabled ................. True +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] global_rank .................. 0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_clipping ............ 
0.0 +[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] loss_scale ................... 0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] memory_breakdown ............. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_name ............... adam +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_params ............. {} +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_enabled .................. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_params ................... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] prescale_gradients ........... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_groups .............. 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_offset .............. 1000 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_period .............. 1000 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_rounding ............ 0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_start_bits .......... 16 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_target_bits ......... 8 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_training_enabled .... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_type ................ 
0 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_verbose ............. False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_name ............... None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_params ............. None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_attention ............. None +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] steps_per_print .............. 10 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_enabled .......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_output_path ...... +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_batch_size ............. 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] use_quantizer_kernel ......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] wall_clock_breakdown ......... False +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] world_size ................... 1 +[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] zero_allow_untested_optimizer False +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_config .................. 
{ + "stage": 2, + "contiguous_gradients": false, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+12, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "legacy_stage1": false +} +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_enabled ................. True +[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 +[2022-09-21 18:46:13,143] [INFO] [config.py:888:print] json = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 2, + "cpu_offload": false + }, + "fp16": { + "enabled": true + }, + "optimizer": { + "type": "Adam", + "params": { + } + }, + "checkpoint": { + "checkpoint_serialization": false, + "writer": { + "type": "fast", + "io_buffer_size": 1.073742e+09, + "io_buffer_double": false, + "show_statistics": true + } + }, + "aio": { + "block_size": 8.388608e+06, + "queue_depth": 8, + "single_submit": false, + "overlap_events": false, + "thread_count": 1 + } +} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.00046539306640625 seconds +[2022-09-21 18:46:13,149] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... 
+No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0002307891845703125 seconds +stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0.4641237258911133, 'aio_write_secs': 0.17467093467712402, 'aio_bytes': 1585909248, 'aio_gbs': 8.455860654115417, 'slow_bytes': 297, 'slow_write_secs': 0.00024700164794921875, 'fill_buffer_count': 153, 'fill_buffer_secs': 0.3299696445465088, 'fill_buffer_speed': 4.476148362022062, 'save_storage': 0, 'save_storage_bytes': 0} +Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0003643035888671875 seconds +stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 1.366792917251587, 'aio_write_secs': 0.8517467975616455, 'aio_bytes': 9288390144, 'aio_gbs': 10.156172524167351, 'slow_bytes': 177, 'slow_write_secs': 0.0003936290740966797, 'fill_buffer_count': 25, 'fill_buffer_secs': 0.5708425045013428, 'fill_buffer_speed': 15.153895084423882, 'save_storage': 0, 'save_storage_bytes': 0} +[2022-09-21 18:46:17,080] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/zero_to_fp32.py +[2022-09-21 18:46:17,080] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/zero_pp_rank_0_mp_rank_00_optim_states.pt +test_ds_fast_save -- 10.13 GB, 3.94 secs, 2.57 gb/s +********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt new file mode 100644 index 000000000..5a5292f6e --- /dev/null +++ 
b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt @@ -0,0 +1,72 @@ +Performance test of deepspeed integration of fast model checkpointing. +torch version = 1.12.0+cu113 +args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) +Model name = gpt2-large +[2022-09-22 01:29:33,721] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 +[2022-09-22 01:29:33,725] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... +-------------------------------------------------------------------------- +WARNING: No preset parameters were found for the device that Open MPI +detected: + + Local host: azwuse57c00009D + Device name: mlx5_ib0 + Device vendor ID: 0x02c9 + Device vendor part ID: 4124 + +Default device parameters will be used, which may result in lower +performance. You can edit any of the files specified by the +btl_openib_device_param_files MCA parameter to set values for your +device. + +NOTE: You can turn off this warning by setting the MCA parameter + btl_openib_warn_no_device_params_found to 0. +-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +By default, for Open MPI 4.0 and later, infiniband ports on a device +are not used by default. The intent is to use UCX for these devices. +You can override this policy by setting the btl_openib_allow_ib MCA parameter +to true. 
+ + Local host: azwuse57c00009D + Local adapter: mlx5_ib0 + Local port: 1 + +-------------------------------------------------------------------------- +-------------------------------------------------------------------------- +WARNING: There was an error initializing an OpenFabrics device. + + Local host: azwuse57c00009D + Local device: mlx5_ib4 +-------------------------------------------------------------------------- +[2022-09-22 01:29:34,587] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 +[2022-09-22 01:29:34,587] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2022-09-22 01:29:34,591] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead +NCCL version 2.10.3+cuda11.3 +[2022-09-22 01:29:38,429] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2022-09-22 01:29:38,430] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer +[2022-09-22 01:29:38,461] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} +Traceback (most recent call last): + File "deepspeed_save_model.py", line 133, in + main() + File "deepspeed_save_model.py", line 129, in main + run(model, model_name, ckpt_name, args) + File "deepspeed_save_model.py", line 106, in run + write_sec = test_save(tag, folder, model, args, writer_type) + File "deepspeed_save_model.py", line 76, in test_save + ds_engine = _get_ds_engine(model, ds_config) + File "deepspeed_save_model.py", line 52, in _get_ds_engine + ds_engine, _, _, _ = deepspeed.initialize( + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/__init__.py", line 124, in initialize + engine = DeepSpeedEngine(args=args, + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 
322, in __init__ + self._configure_optimizer(optimizer, model_parameters) + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1178, in _configure_optimizer + self.optimizer = self._configure_fp16_optimizer(basic_optimizer) + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1314, in _configure_fp16_optimizer + or self.fp16_fused_mode() \ + File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 792, in fp16_fused_mode + return self._config.fp16_fused_mode +AttributeError: 'DeepSpeedConfig' object has no attribute 'fp16_fused_mode' +[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / no device params found +[azwuse57c00009D:37114] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages +[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / ib port not selected diff --git a/fast_io/model_checkpoint/requirements.txt b/fast_io/model_checkpoint/requirements.txt new file mode 100644 index 000000000..976a2b1f3 --- /dev/null +++ b/fast_io/model_checkpoint/requirements.txt @@ -0,0 +1 @@ +transformers diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py new file mode 100644 index 000000000..faf4fc5d8 --- /dev/null +++ b/fast_io/model_checkpoint/save_model_utils.py @@ -0,0 +1,116 @@ +import argparse +import os +from transformers import AutoModelForCausalLM +from transformers import T5ForConditionalGeneration +from torch_save_utils import PINNED_BUFFER_MB + + +GPT2L = 'gpt2-large' +TINY_T5 = 'tiny-t5' +PHI3_MINI = 'phi3' +PHI3_VISION = 'phi3-v' +LLAMA3_1B = 'llama3-1B' + +HF_MODELS_DICT = { + TINY_T5: "hf-internal-testing/tiny-random-t5", + GPT2L: GPT2L, + PHI3_MINI: "microsoft/Phi-3.5-mini-instruct", + PHI3_VISION: "microsoft/Phi-3.5-vision-instruct", + LLAMA3_1B: "meta-llama/Llama-3.2-1B", +} + +def _get_hf_model(tag): + model_name = 
HF_MODELS_DICT[tag] + if tag == TINY_T5: + model = T5ForConditionalGeneration.from_pretrained(model_name) + else: + model = AutoModelForCausalLM.from_pretrained(model_name) + + return model, model_name, tag + +def get_model(model_tag): + return _get_hf_model(model_tag) + + +def validate_arguments(args): + success = True + + if not args.model in HF_MODELS_DICT: + print(f'{args.model} is not a supported HF model tag') + success = False + + if args.optimizer and args.half: + if not args.gpu: + print(f'mixed precision only supported with gpu tensors') + success = False + + return success + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + + parser.add_argument( + '--model', + default=None, + type=str, + required=True, + help=f'HuggingFace tag of model. Available models = {list(HF_MODELS_DICT.keys())}') + + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) + + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + + parser.add_argument('--optimizer', + action='store_true', + help='Include optimizer state in checkpoint.') + + parser.add_argument('--fused', + action='store_true', + help='Use fused fp16 optimizer.') + + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + + parser.add_argument('--half', + action='store_true', + help='Use half-precision tensors.') + + parser.add_argument( + '--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + help=f'Size of pinned i/o buffer in MB. Default = {PINNED_BUFFER_MB}') + + parser.add_argument('--zero_stage', + type=int, + default=0, + help='ZeRO optimization stage. 
Default = 0') + + parser.add_argument('--cpu_offload', + action='store_true', + help='Enable CPU offload of optimizer state.') + + parser.add_argument('--no-statistics', + action='store_true', + help='Suppress low-level performance statistics.') + + parser.add_argument('--single_io_buffer', + action='store_true', + help='Disable double buffering of i/o buffer.') + + + #parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') + + args = parser.parse_args() + print(f'args = {args}') + return args diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py new file mode 100644 index 000000000..6c1103049 --- /dev/null +++ b/fast_io/model_checkpoint/torch_save_model.py @@ -0,0 +1,76 @@ +import time +import torch +from torch.optim import Adam +import os +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save +from save_model_utils import get_model, validate_arguments, parse_arguments +import deepspeed +from deepspeed.accelerator import get_accelerator + + +def run(model, model_name, ckpt_name, args): + print(f'Model name = {model_name}') + fn_dict = { + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, + } + for tag, fn in fn_dict.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue + file = os.path.join(args.folder, f'{tag}_{ckpt_name}.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = fn(file, model, args) + ckpt_size = os.path.getsize(file) + gb_size = ckpt_size / (1024**3) + gb_per_sec = gb_size / write_sec + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' + ) + 
print(f'*********************************************') + + +def _get_initialized_optimizer(model, fused_opt): + base_optimizer = Adam(model.parameters()) + if fused_opt: + from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper + else: + from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper + optimizer = FP16_Wrapper(base_optimizer) + for p in model.parameters(): + p.grad = torch.zeros_like(p) + optimizer.step() + return optimizer + + +def main(): + print( + f'Performance test of torch.save() integration of fast model checkpointing.' + ) + print(f'torch version = {torch.__version__}') + torch.manual_seed(42) + + args = parse_arguments() + if not validate_arguments(args): + quit() + + model, model_name, ckpt_name = get_model(args.model) + if args.half: + model = model.half() + if args.gpu: + model = model.to(get_accelerator().current_device_name()) + if args.optimizer: + optimizer = _get_initialized_optimizer(model, args.fused) + ckpt_state = {'model': model, 'optimizer': optimizer} + else: + ckpt_state = {'model': model} + run(ckpt_state, model_name, ckpt_name, args) + + +if __name__ == "__main__": + main() diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py new file mode 100644 index 000000000..014fdd035 --- /dev/null +++ b/fast_io/model_checkpoint/torch_save_tensor.py @@ -0,0 +1,95 @@ +import time +import argparse +import torch +import os +from torch_save_utils import PINNED_BUFFER_MB +from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save +import deepspeed +from deepspeed.accelerator import get_accelerator + + +def run(args): + device = get_accelerator().current_device_name() if args.gpu else 'cpu' + buffer = torch.randint(high=128, + size=(args.mb_size * (1024**2), ), + dtype=torch.uint8, + device=device) + + fn_dict = { + # 'test_save': test_save, + # 'test_ds_mock_save': 
test_ds_mock_save, + # 'test_ds_py_save': test_ds_py_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save + } + for tag, fn in fn_dict.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue + file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = fn(file, buffer, args) + gb_per_sec = args.mb_size / (1024.0 * write_sec) + gb_size = os.path.getsize(file) / (1024**3) + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' + ) + print(f'*********************************************') + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--folder', + default=None, + type=str, + required=True, + help='Folder to use for I/O.') + parser.add_argument('--mb_size', + type=int, + default=None, + required=True, + help='Size of tensor to save in MB.') + parser.add_argument('--legacy', + action='store_true', + help='Use torch legacy save format') + + parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') + + parser.add_argument('--io_buffer_mb', + type=int, + default=PINNED_BUFFER_MB, + help='Size of pinned i/o buffer in MB.') + + parser.add_argument('--no-statistics', + action='store_true', + help='Suppress low-level performance statistics.') + + parser.add_argument('--single_io_buffer', + action='store_true', + help='Disable double buffering of i/o buffer.') + parser.add_argument('--local_rank', + type=int, + default=0, + help='Local rank' ) + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def main(): + print( + f'Performance test of torch.save() integration of fast tensor checkpointing.' 
+ ) + args = parse_arguments() + if not os.path.exists(args.folder): + print(f'Invalid folder: {args.folder}') + quit() + deepspeed.init_distributed() + run(args) + + +if __name__ == "__main__": + main() diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py new file mode 100644 index 000000000..cf5f2bba5 --- /dev/null +++ b/fast_io/model_checkpoint/torch_save_utils.py @@ -0,0 +1,111 @@ +import time +import torch +import os +import deepspeed +from deepspeed.ops.op_builder import AsyncIOBuilder, GDSBuilder +from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter, FastFileWriterConfig +from deepspeed.accelerator import get_accelerator + +AIO_QUEUE_DEPTH = 8 +AIO_BLOCK_SIZE = 8 * (1024**2) +AIO_INTRA_OP_PARALLEL = 1 +AIO_SINGLE_SUBMIT = False +AIO_OVERLAP_EVENTS = False +PINNED_BUFFER_MB = 64 + + +def _get_aio_handle(): + h = AsyncIOBuilder().load(verbose=False).aio_handle(block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + intra_op_parallelism=AIO_INTRA_OP_PARALLEL) + return h + +def _get_gds_handle(): + h = GDSBuilder().load(verbose=False).gds_handle(block_size=AIO_BLOCK_SIZE, + queue_depth=AIO_QUEUE_DEPTH, + single_submit=AIO_SINGLE_SUBMIT, + overlap_events=AIO_SINGLE_SUBMIT, + intra_op_parallelism=AIO_INTRA_OP_PARALLEL) + return h + +def test_save(file, buffer, args): + st = time.time() + torch.save(f=file, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) + return time.time() - st + + +def test_ds_mock_save(file, buffer, args): + st = time.time() + ds_mock_writer = MockFileWriter(file) + torch.save(f=ds_mock_writer, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) + ds_mock_writer.close() # Force flush to storage + write_sec = time.time() - st + if not args.no_statistics: + ds_mock_writer._dump_state() + return write_sec + + +def test_ds_py_save(file, buffer, args): + st = time.time() + 
ds_py_writer = PyFileWriter(file) + torch.save(f=ds_py_writer, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) + ds_py_writer.close() # Force flush to storage + write_sec = time.time() - st + if not args.no_statistics: + ds_py_writer._dump_state() + return write_sec + +def _get_aio_components(args): + h = _get_aio_handle() + pinned_memory = torch.zeros(args.io_buffer_mb * (1024**2), + dtype=torch.uint8, + device='cpu').pin_memory() + return h, pinned_memory + +def _get_gds_components(args): + h = _get_gds_handle() + pinned_memory = torch.empty(args.io_buffer_mb * (1024**2), + dtype=torch.uint8, + device=get_accelerator().current_device_name()) + h.pin_device_tensor(pinned_memory) + return h, pinned_memory + + + +def _test_ds_fast_save(file, buffer, args, use_gds): + if use_gds: + h, pinned_memory = _get_gds_components(args) + else: + h, pinned_memory = _get_aio_components(args) + st = time.time() + fast_writer_config = FastFileWriterConfig(dnvme_handle=h, + pinned_tensor=pinned_memory, + double_buffer=not args.single_io_buffer, + num_parallel_writers=1, + writer_rank=0) + + ds_fast_writer = FastFileWriter(file_path=file, + config=fast_writer_config) + torch.save(f=ds_fast_writer, + obj=buffer, + _use_new_zipfile_serialization=not args.legacy) + ds_fast_writer.close() # Force flush to storage + write_sec = time.time() - st + if not args.no_statistics: + ds_fast_writer._dump_state() + return write_sec + + +def test_ds_aio_fast_save(file, buffer, args): + return _test_ds_fast_save(file, buffer, args, False) + +def test_ds_gds_fast_save(file, buffer, args): + return _test_ds_fast_save(file, buffer, args, True) From d01aa278c7676f16b17c2ba2ee4a1a46b431f3a3 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 26 Mar 2025 09:36:09 -0400 Subject: [PATCH 30/40] Move folder Signed-off-by: Olatunji Ruwase --- deepnvme/file_access/aio_load_cpu_tensor.py | 11 +- deepnvme/file_access/aio_store_cpu_tensor.py | 10 +- .../model_checkpoint/checkpoint_compare.py 
| 0 .../model_checkpoint/deepspeed_save_model.py | 0 deepnvme/model_checkpoint/local_cufile.json | 1 + .../model_checkpoint/requirements.txt | 0 .../model_checkpoint/save_model_utils.py | 0 .../model_checkpoint/torch_save_model.py | 0 .../model_checkpoint/torch_save_tensor.py | 52 +- .../model_checkpoint/torch_save_utils.py | 0 .../log_9_21_22/gpt2-unfused.txt | 599 -------------- .../log_9_21_22/gpt2_fused_z2.txt | 781 ------------------ .../log_9_21_22/torch_star_half_error.txt | 72 -- 13 files changed, 68 insertions(+), 1458 deletions(-) rename {fast_io => deepnvme}/model_checkpoint/checkpoint_compare.py (100%) rename {fast_io => deepnvme}/model_checkpoint/deepspeed_save_model.py (100%) create mode 100644 deepnvme/model_checkpoint/local_cufile.json rename {fast_io => deepnvme}/model_checkpoint/requirements.txt (100%) rename {fast_io => deepnvme}/model_checkpoint/save_model_utils.py (100%) rename {fast_io => deepnvme}/model_checkpoint/torch_save_model.py (100%) rename {fast_io => deepnvme}/model_checkpoint/torch_save_tensor.py (65%) rename {fast_io => deepnvme}/model_checkpoint/torch_save_utils.py (100%) delete mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt delete mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt delete mode 100644 fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt diff --git a/deepnvme/file_access/aio_load_cpu_tensor.py b/deepnvme/file_access/aio_load_cpu_tensor.py index 27a1e61c5..d6f767231 100644 --- a/deepnvme/file_access/aio_load_cpu_tensor.py +++ b/deepnvme/file_access/aio_load_cpu_tensor.py @@ -2,6 +2,7 @@ import os, timeit, functools from deepspeed.ops.op_builder import AsyncIOBuilder from utils import parse_read_arguments, GIGA_UNIT +from deepspeed.accelerator import get_accelerator def file_read(inp_f, handle, bounce_buffer): handle.sync_pread(bounce_buffer, inp_f) @@ -14,7 +15,12 @@ def main(): cnt = args.loop aio_handle = AsyncIOBuilder().load().aio_handle() - bounce_buffer = 
torch.empty(os.path.getsize(input_file), dtype=torch.uint8).pin_memory() + native_locked_tensor = get_accelerator()._name == 'cpu' + + if native_locked_tensor: + bounce_buffer = aio_handle.new_cpu_locked_tensor(file_sz, torch.Tensor().to(torch.uint8)) + else: + bounce_buffer = torch.empty(file_sz, dtype=torch.uint8).pin_memory() t = timeit.Timer(functools.partial(file_read, input_file, aio_handle, bounce_buffer)) aio_t = t.timeit(cnt) @@ -27,5 +33,8 @@ def main(): py_tensor = py_file_read(input_file) print(f'Validation success = {aio_tensor.equal(py_tensor)}') + if native_locked_tensor: + aio_handle.free_cpu_locked_tensor(bounce_buffer) + if __name__ == "__main__": main() diff --git a/deepnvme/file_access/aio_store_cpu_tensor.py b/deepnvme/file_access/aio_store_cpu_tensor.py index 20c03792b..5cdd6f68b 100644 --- a/deepnvme/file_access/aio_store_cpu_tensor.py +++ b/deepnvme/file_access/aio_store_cpu_tensor.py @@ -2,6 +2,7 @@ import os, timeit, functools, pathlib from deepspeed.ops.op_builder import AsyncIOBuilder from utils import parse_write_arguments, GIGA_UNIT +from deepspeed.accelerator import get_accelerator def file_write(out_f, tensor, handle, bounce_buffer): bounce_buffer.copy_(tensor) @@ -14,9 +15,13 @@ def main(): pathlib.Path(output_file).unlink(missing_ok=True) file_sz = args.mb_size*(1024**2) app_tensor = torch.empty(file_sz, dtype=torch.uint8, device='cpu', requires_grad=False) + native_locked_tensor = get_accelerator()._name == 'cpu' aio_handle = AsyncIOBuilder().load().aio_handle() - bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() + if native_locked_tensor: + bounce_buffer = aio_handle.new_cpu_locked_tensor(file_sz, torch.Tensor().to(torch.uint8)) + else: + bounce_buffer = torch.empty(file_sz, dtype=torch.uint8, requires_grad=False).pin_memory() t = timeit.Timer(functools.partial(file_write, output_file, app_tensor, aio_handle, bounce_buffer)) @@ -33,6 +38,9 @@ def main(): filecmp.clear_cache() 
print(f'Validation success = {filecmp.cmp(py_ref_file, output_file, shallow=False) }') + if native_locked_tensor: + aio_handle.free_cpu_locked_tensor(bounce_buffer) + pathlib.Path(output_file).unlink(missing_ok=True) diff --git a/fast_io/model_checkpoint/checkpoint_compare.py b/deepnvme/model_checkpoint/checkpoint_compare.py similarity index 100% rename from fast_io/model_checkpoint/checkpoint_compare.py rename to deepnvme/model_checkpoint/checkpoint_compare.py diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/deepnvme/model_checkpoint/deepspeed_save_model.py similarity index 100% rename from fast_io/model_checkpoint/deepspeed_save_model.py rename to deepnvme/model_checkpoint/deepspeed_save_model.py diff --git a/deepnvme/model_checkpoint/local_cufile.json b/deepnvme/model_checkpoint/local_cufile.json new file mode 100644 index 000000000..7d4d9c8e3 --- /dev/null +++ b/deepnvme/model_checkpoint/local_cufile.json @@ -0,0 +1 @@ +{"execution": {"max_io_queue_depth": 8, "max_request_parallelism": 1, "max_io_threads": 1, "parallel_io": true, "min_io_threshold_size_kb": 8192}} \ No newline at end of file diff --git a/fast_io/model_checkpoint/requirements.txt b/deepnvme/model_checkpoint/requirements.txt similarity index 100% rename from fast_io/model_checkpoint/requirements.txt rename to deepnvme/model_checkpoint/requirements.txt diff --git a/fast_io/model_checkpoint/save_model_utils.py b/deepnvme/model_checkpoint/save_model_utils.py similarity index 100% rename from fast_io/model_checkpoint/save_model_utils.py rename to deepnvme/model_checkpoint/save_model_utils.py diff --git a/fast_io/model_checkpoint/torch_save_model.py b/deepnvme/model_checkpoint/torch_save_model.py similarity index 100% rename from fast_io/model_checkpoint/torch_save_model.py rename to deepnvme/model_checkpoint/torch_save_model.py diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/deepnvme/model_checkpoint/torch_save_tensor.py similarity index 65% rename from 
fast_io/model_checkpoint/torch_save_tensor.py rename to deepnvme/model_checkpoint/torch_save_tensor.py index 014fdd035..381deebde 100644 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ b/deepnvme/model_checkpoint/torch_save_tensor.py @@ -6,9 +6,51 @@ from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save import deepspeed from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist +import multiprocessing as mp +import os +FUNC_DICT = { + # 'test_save': test_save, + # 'test_ds_mock_save': test_ds_mock_save, + # 'test_ds_py_save': test_ds_py_save, + 'test_ds_gds_fast_save': test_ds_gds_fast_save, + # 'test_ds_aio_fast_save': test_ds_aio_fast_save, +} def run(args): + + for tag, fn in FUNC_DICT.items(): + if tag == 'test_ds_gds_fast_save' and not args.gpu: + continue + print(f"launching {tag=} from {os.getpid()=}") + mp.set_start_method('spawn', force=True) + run_save_method(tag, args) + + +def run_save_method(tag, args): + print(f"running {tag=} from {os.getpid()=}") + device = get_accelerator().current_device_name() if args.gpu else 'cpu' + buffer = torch.randint(high=128, + size=(args.mb_size * (1024**2), ), + dtype=torch.uint8, + device=device) + + file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') + print(f'checkpoint file = {file}') + if os.path.isfile(file): + os.remove(file) + st = time.time() + write_sec = FUNC_DICT[tag](file, buffer, args) + gb_per_sec = args.mb_size / (1024.0 * write_sec) + gb_size = os.path.getsize(file) / (1024**3) + print( + f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' + ) + print(f'*********************************************') + + +def old_run(args): device = get_accelerator().current_device_name() if args.gpu else 'cpu' buffer = torch.randint(high=128, size=(args.mb_size * (1024**2), ), @@ -16,10 +58,10 @@ def run(args): device=device) fn_dict = { - # 'test_save': test_save, - # 
'test_ds_mock_save': test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - 'test_ds_aio_fast_save': test_ds_aio_fast_save, + 'test_save': test_save, + 'test_ds_mock_save': test_ds_mock_save, + 'test_ds_py_save': test_ds_py_save, + # 'test_ds_aio_fast_save': test_ds_aio_fast_save, 'test_ds_gds_fast_save': test_ds_gds_fast_save } for tag, fn in fn_dict.items(): @@ -39,6 +81,7 @@ def run(args): print(f'*********************************************') + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--folder', @@ -89,6 +132,7 @@ def main(): quit() deepspeed.init_distributed() run(args) + dist.destroy_process_group() if __name__ == "__main__": diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/deepnvme/model_checkpoint/torch_save_utils.py similarity index 100% rename from fast_io/model_checkpoint/torch_save_utils.py rename to deepnvme/model_checkpoint/torch_save_utils.py diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt deleted file mode 100644 index 33985e8db..000000000 --- a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt +++ /dev/null @@ -1,599 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) -Model name = gpt2-large -[2022-09-21 18:42:17,245] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:17,246] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
-[2022-09-21 18:42:18,108] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-21 18:42:18,109] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl -[2022-09-21 18:42:21,535] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -NCCL version 2.10.3+cuda11.3 -[2022-09-21 18:42:21,770] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:21,772] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:21,772] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:21,772] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:22,127] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:22,127] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_enabled ................. 
True -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_rounding ............ 
0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_config .................. 
{ - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:22,130] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... 
-Time to load utils op: 0.3399326801300049 seconds -[2022-09-21 18:42:23,204] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt -test_save -- 10.13 GB, 6.83 secs, 1.48 gb/s -********************************************* -[2022-09-21 18:42:30,157] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:30,164] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:30,277] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:30,278] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:30,278] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:30,278] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:30,656] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:30,656] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_offset .............. 
1000 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:30,659] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "mock", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0004949569702148438 seconds -[2022-09-21 18:42:30,786] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -test_ds_mock_save -- 0.00 GB, 0.93 secs, 0.00 gb/s -********************************************* -[2022-09-21 18:42:32,824] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:32,831] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:32,926] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:32,927] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:32,927] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:32,927] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:33,248] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:33,248] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] elasticity_enabled ........... 
False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_groups .............. 
1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "python", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.000392913818359375 seconds -[2022-09-21 18:42:33,377] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523621, 'write_secs': 5.274229288101196} -test_ds_py_save -- 10.13 GB, 6.32 secs, 1.60 gb/s -********************************************* -[2022-09-21 18:42:39,940] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:39,946] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:40,048] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:40,049] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:40,049] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:40,049] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:40,439] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:40,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... -Building extension module async_io... -Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module async_io... -Time to load async_io op: 0.4869067668914795 seconds -[2022-09-21 18:42:41,329] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 
0 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_job_name ......... 
DeepSpeedJobName -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_optimization_stage ...... 
0 -[2022-09-21 18:42:41,332] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "fast", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0004849433898925781 seconds -[2022-09-21 18:42:41,458] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0003745555877685547 seconds -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 1.8456230163574219, 'aio_write_secs': 0.9408478736877441, 'aio_bytes': 10874523136, 'aio_gbs': 10.76442766994695, 'slow_bytes': 483, 'slow_write_secs': 0.0002315044403076172, 'fill_buffer_count': 4519, 'fill_buffer_secs': 0.9024286270141602, 'fill_buffer_speed': 11.22270347101499, 'save_storage': 0, 'save_storage_bytes': 0} -test_ds_fast_save -- 10.13 GB, 3.00 secs, 3.38 gb/s -********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt deleted file mode 100644 index 9871b634e..000000000 --- a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt +++ /dev/null @@ -1,781 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=True, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=2) -Model name = gpt2-large -[2022-09-21 18:45:23,129] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:45:23,130] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
-[2022-09-21 18:45:23,991] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-21 18:45:23,991] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl -[2022-09-21 18:45:27,189] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -NCCL version 2.10.3+cuda11.3 -[2022-09-21 18:45:27,478] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Creating extension directory /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam... -Detected CUDA files, patching ldflags -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam/build.ninja... -Building extension module fused_adam... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options 
'-fPIC' -lineinfo -O3 --use_fast_math -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o -[2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o -[3/3] c++ fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_adam.so -Loading extension module fused_adam... 
-Time to load fused_adam op: 19.252447843551636 seconds -[2022-09-21 18:45:47,263] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:45:47,263] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:45:47,263] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:45:47,263] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:45:47,263] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:45:47,263] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... 
-Time to load utils op: 0.3341379165649414 seconds -[2022-09-21 18:45:47,651] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:45:47,652] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB -[2022-09-21 18:45:47,652] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.58 GB, percent = 1.3% -[2022-09-21 18:45:47,945] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:45:47,946] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB -[2022-09-21 18:45:47,946] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% -[2022-09-21 18:45:48,634] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:45:48,635] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,635] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% -[2022-09-21 18:45:48,681] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:45:48,682] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,682] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.53 GB, percent = 1.3% -[2022-09-21 18:45:48,733] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:45:48,734] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,734] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.4 GB, percent = 1.3% -[2022-09-21 18:45:48,796] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:45:48,797] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 7.38 GB 
Max_CA 7 GB -[2022-09-21 18:45:48,797] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,848] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:45:48,849] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 7.38 GB Max_CA 7 GB -[2022-09-21 18:45:48,849] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,920] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:45:48,921] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 16.04 GB Max_CA 16 GB -[2022-09-21 18:45:48,921] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,921] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:45:48,968] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:45:48,969] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 16.04 GB Max_CA 16 GB -[2022-09-21 18:45:48,969] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:45:48,969] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:45:48,969] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_offset .............. 
1000 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:45:48,972] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0004029273986816406 seconds -[2022-09-21 18:45:49,143] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt -[2022-09-21 18:45:56,478] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_save/zero_to_fp32.py -[2022-09-21 18:45:56,479] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_save -- 10.13 GB, 7.51 secs, 1.35 gb/s -********************************************* -[2022-09-21 18:45:56,603] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:45:56,610] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:45:56,709] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... 
-Time to load fused_adam op: 0.0011363029479980469 seconds -[2022-09-21 18:45:56,771] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:45:56,771] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:45:56,771] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:45:56,771] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:45:56,771] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:45:56,771] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00023317337036132812 seconds -[2022-09-21 18:45:56,823] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:45:56,824] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 16 GB -[2022-09-21 18:45:56,824] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.55 GB, percent = 1.3% -[2022-09-21 18:45:57,123] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:45:57,124] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,124] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.54 GB, percent = 1.3% -[2022-09-21 18:45:57,614] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:45:57,615] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,616] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.51 GB, percent = 1.3% 
-[2022-09-21 18:45:57,661] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:45:57,662] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,662] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% -[2022-09-21 18:45:57,713] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:45:57,714] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,714] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,775] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:45:57,775] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:45:57,776] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:57,827] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:45:57,828] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:45:57,828] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,887] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:45:57,887] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:45:57,888] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.38 GB, percent = 1.3% -[2022-09-21 18:45:57,888] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:45:57,933] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:45:57,934] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:45:57,934] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,934] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:45:57,935] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:45:57,935] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... 
None -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... 
False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_gradients_enabled ..... 
False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_optimization_stage ...... 
2 -[2022-09-21 18:45:57,937] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "mock", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.000377655029296875 seconds -[2022-09-21 18:45:57,942] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -[2022-09-21 18:45:59,953] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/zero_to_fp32.py -[2022-09-21 18:45:59,953] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_mock_save -- 0.00 GB, 2.02 secs, 0.00 gb/s -********************************************* -[2022-09-21 18:46:00,921] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 
18:46:00,928] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:46:01,026] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... -Time to load fused_adam op: 0.001192331314086914 seconds -[2022-09-21 18:46:01,079] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:46:01,079] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:46:01,079] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:46:01,079] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:46:01,080] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:46:01,080] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0002560615539550781 seconds -[2022-09-21 18:46:01,130] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:46:01,131] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB -[2022-09-21 18:46:01,132] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.63 GB, percent = 1.3% -[2022-09-21 18:46:01,426] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:46:01,427] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,427] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,861] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:46:01,862] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,863] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,907] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:46:01,908] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,908] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,959] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:46:01,960] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,960] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,013] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:46:02,013] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB 
Max_CA 6 GB -[2022-09-21 18:46:02,014] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,065] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:46:02,066] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:02,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,125] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:46:02,126] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:02,126] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,126] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:46:02,172] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:46:02,173] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:02,173] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:46:02,174] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:46:02,174] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] elasticity_enabled ........... 
False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_groups .............. 
1 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:46:02,176] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "python", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0003757476806640625 seconds -[2022-09-21 18:46:02,181] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909547, 'write_secs': 0.7758586406707764} -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390323, 'write_secs': 4.455736398696899} -[2022-09-21 18:46:09,408] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/zero_to_fp32.py -[2022-09-21 18:46:09,409] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_py_save -- 10.13 GB, 7.23 secs, 1.40 gb/s -********************************************* -[2022-09-21 18:46:09,498] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:46:09,504] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:46:09,602] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... 
-Time to load fused_adam op: 0.0010247230529785156 seconds -[2022-09-21 18:46:09,666] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:46:09,666] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:46:09,666] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:46:09,666] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:46:09,666] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:46:09,666] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0002410411834716797 seconds -[2022-09-21 18:46:09,746] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:46:09,747] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB -[2022-09-21 18:46:09,747] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.6 GB, percent = 1.3% -[2022-09-21 18:46:10,065] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:46:10,066] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:10,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.59 GB, percent = 1.3% -[2022-09-21 18:46:11,872] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:46:11,873] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,873] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% 
-[2022-09-21 18:46:11,918] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:46:11,919] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,919] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% -[2022-09-21 18:46:11,969] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:46:11,970] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,971] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,030] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:46:12,030] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:12,031] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,081] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:46:12,082] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:12,082] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,141] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:46:12,142] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:12,142] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,142] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:46:12,188] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:46:12,188] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:12,189] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:46:12,189] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... -Building extension module async_io... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module async_io... -Time to load async_io op: 0.5492517948150635 seconds -[2022-09-21 18:46:13,140] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] checkpoint_config ............ 
{'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_clipping ............ 
0.0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_type ................ 
0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_config .................. 
{ - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:46:13,143] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "fast", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00046539306640625 seconds -[2022-09-21 18:46:13,149] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... 
-No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0002307891845703125 seconds -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0.4641237258911133, 'aio_write_secs': 0.17467093467712402, 'aio_bytes': 1585909248, 'aio_gbs': 8.455860654115417, 'slow_bytes': 297, 'slow_write_secs': 0.00024700164794921875, 'fill_buffer_count': 153, 'fill_buffer_secs': 0.3299696445465088, 'fill_buffer_speed': 4.476148362022062, 'save_storage': 0, 'save_storage_bytes': 0} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0003643035888671875 seconds -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 1.366792917251587, 'aio_write_secs': 0.8517467975616455, 'aio_bytes': 9288390144, 'aio_gbs': 10.156172524167351, 'slow_bytes': 177, 'slow_write_secs': 0.0003936290740966797, 'fill_buffer_count': 25, 'fill_buffer_secs': 0.5708425045013428, 'fill_buffer_speed': 15.153895084423882, 'save_storage': 0, 'save_storage_bytes': 0} -[2022-09-21 18:46:17,080] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/zero_to_fp32.py -[2022-09-21 18:46:17,080] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_fast_save -- 10.13 GB, 3.94 secs, 2.57 gb/s -********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt deleted file mode 100644 index 5a5292f6e..000000000 --- 
a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt +++ /dev/null @@ -1,72 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) -Model name = gpt2-large -[2022-09-22 01:29:33,721] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 -[2022-09-22 01:29:33,725] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... --------------------------------------------------------------------------- -WARNING: No preset parameters were found for the device that Open MPI -detected: - - Local host: azwuse57c00009D - Device name: mlx5_ib0 - Device vendor ID: 0x02c9 - Device vendor part ID: 4124 - -Default device parameters will be used, which may result in lower -performance. You can edit any of the files specified by the -btl_openib_device_param_files MCA parameter to set values for your -device. - -NOTE: You can turn off this warning by setting the MCA parameter - btl_openib_warn_no_device_params_found to 0. --------------------------------------------------------------------------- --------------------------------------------------------------------------- -By default, for Open MPI 4.0 and later, infiniband ports on a device -are not used by default. The intent is to use UCX for these devices. -You can override this policy by setting the btl_openib_allow_ib MCA parameter -to true. 
- - Local host: azwuse57c00009D - Local adapter: mlx5_ib0 - Local port: 1 - --------------------------------------------------------------------------- --------------------------------------------------------------------------- -WARNING: There was an error initializing an OpenFabrics device. - - Local host: azwuse57c00009D - Local device: mlx5_ib4 --------------------------------------------------------------------------- -[2022-09-22 01:29:34,587] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-22 01:29:34,587] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl -[2022-09-22 01:29:34,591] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead -NCCL version 2.10.3+cuda11.3 -[2022-09-22 01:29:38,429] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False -[2022-09-22 01:29:38,430] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-22 01:29:38,461] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} -Traceback (most recent call last): - File "deepspeed_save_model.py", line 133, in - main() - File "deepspeed_save_model.py", line 129, in main - run(model, model_name, ckpt_name, args) - File "deepspeed_save_model.py", line 106, in run - write_sec = test_save(tag, folder, model, args, writer_type) - File "deepspeed_save_model.py", line 76, in test_save - ds_engine = _get_ds_engine(model, ds_config) - File "deepspeed_save_model.py", line 52, in _get_ds_engine - ds_engine, _, _, _ = deepspeed.initialize( - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/__init__.py", line 124, in initialize - engine = DeepSpeedEngine(args=args, - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 
322, in __init__ - self._configure_optimizer(optimizer, model_parameters) - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1178, in _configure_optimizer - self.optimizer = self._configure_fp16_optimizer(basic_optimizer) - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1314, in _configure_fp16_optimizer - or self.fp16_fused_mode() \ - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 792, in fp16_fused_mode - return self._config.fp16_fused_mode -AttributeError: 'DeepSpeedConfig' object has no attribute 'fp16_fused_mode' -[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / no device params found -[azwuse57c00009D:37114] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages -[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / ib port not selected From 4059f805893e73450c0c41f9debe82cc15fd5df5 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 26 Mar 2025 09:46:11 -0400 Subject: [PATCH 31/40] Remove folder Signed-off-by: Olatunji Ruwase --- .../model_checkpoint/checkpoint_compare.py | 123 --- .../model_checkpoint/deepspeed_save_model.py | 139 ---- .../log_9_21_22/gpt2-unfused.txt | 599 -------------- .../log_9_21_22/gpt2_fused_z2.txt | 781 ------------------ .../log_9_21_22/torch_star_half_error.txt | 72 -- fast_io/model_checkpoint/requirements.txt | 1 - fast_io/model_checkpoint/save_model_utils.py | 116 --- fast_io/model_checkpoint/torch_save_model.py | 76 -- fast_io/model_checkpoint/torch_save_tensor.py | 95 --- fast_io/model_checkpoint/torch_save_utils.py | 111 --- 10 files changed, 2113 deletions(-) delete mode 100644 fast_io/model_checkpoint/checkpoint_compare.py delete mode 100644 fast_io/model_checkpoint/deepspeed_save_model.py delete mode 100644 fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt delete mode 100644 
fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt delete mode 100644 fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt delete mode 100644 fast_io/model_checkpoint/requirements.txt delete mode 100644 fast_io/model_checkpoint/save_model_utils.py delete mode 100644 fast_io/model_checkpoint/torch_save_model.py delete mode 100644 fast_io/model_checkpoint/torch_save_tensor.py delete mode 100644 fast_io/model_checkpoint/torch_save_utils.py diff --git a/fast_io/model_checkpoint/checkpoint_compare.py b/fast_io/model_checkpoint/checkpoint_compare.py deleted file mode 100644 index cc67b61d9..000000000 --- a/fast_io/model_checkpoint/checkpoint_compare.py +++ /dev/null @@ -1,123 +0,0 @@ -#This script is for testing whether two checkpoints match; it prints all the differences - -import torch -import os -import sys -import pickle -from collections import OrderedDict - -exclude_key_str = {'ds_config/checkpoint/writer'} - -def main(): - dir1 = sys.argv[1] - dir2 = sys.argv[2] - print ("Begin comparison") - print ("The first directory {}" .format(dir1)) - print ("The second directory {}" .format(dir2)) - print (' ') - - file_list1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))] - file_list2 = [f for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))] - common_files = [] - - for f in file_list1: - if not (f in file_list2): - log_error_file_mismatch_first(f) - else: - common_files.append(f) - for f in file_list2: - if not (f in file_list1): - log_error_file_mismatch_second(f) - - for f in common_files: - full_dir1 = os.path.join(dir1, f) - full_dir2 = os.path.join(dir2, f) - print ("Begin comparison") - print("The first checkpoint {}" .format(full_dir1)) - print("The second checkpoint {}" .format(full_dir2)) - print(' ') - model_first = torch.load(full_dir1) - model_second = torch.load(full_dir2) - object_compare(model_first, model_second, []) - - -def object_compare(model_first, model_second, key_chain): - if not 
(type(model_first) == type(model_second)): - log_error_value_mismatch(model_first, model_second, key_chain) - return - - if type(model_first) is list: - if len(model_first) != len(model_second): - log_error_value_mismatch(model_first, model_second, key_chain) - return - for i in range(len(model_first)): - object_compare(model_first[i], model_second[i], key_chain) - return - - if type(model_first) is dict or type(model_first) is OrderedDict: - common_keys = [] - for key in model_first: - if key not in model_second: - key_chain.append(key) - log_error_key_mismatch_first(model_first[key], key_chain) - key_chain.pop() - else: - common_keys.append(key) - - for key in model_second: - if key not in model_first: - key_chain.append(key) - log_error_key_mismatch_second(model_second[key], key_chain) - key_chain.pop() - - for key in common_keys: - key_chain.append(key) - object_compare(model_first[key], model_second[key], key_chain) - key_chain.pop() - return - - if hasattr(model_first, '__dict__'): - equality = (model_first.__dict__ == model_second.__dict__) - else: - equality = (model_first == model_second) - if type(equality) is not bool: - equality = (equality.all()) - if not equality: - log_error_value_mismatch(model_first, model_second, key_chain) - return - - -def log_error_file_mismatch_first(filename): - print("The following file appeared in the first but not the second directory: {}" .format(filename)) - print(' ') - - -def log_error_file_mismatch_second(filename): - print("The following key appeared in the second but not the first directory: {}" .format(filename)) - print(" ") - - -def log_error_key_mismatch_first(model, key_chain): - key_str = "/".join(key_chain) - if not (key_str in exclude_key_str): - print("The following key appeared in the first but not the second model: {}" .format(key_str)) - print("The value of the first model is: {}" .format(model)) - print(" ") - - -def log_error_key_mismatch_second(model, key_chain): - key_str = "/".join(key_chain) - if 
not (key_str in exclude_key_str): - print("The following key appeared in the second but not the first model: {}" .format(key_str)) - print("The value of the second model is: {}" .format(model)) - print(" ") - - -def log_error_value_mismatch(model_first, model_second, key_chain): - print ("The values of the following key do not match: {}" .format("/".join(key_chain))) - print ("The value of the first model is: {}" .format(model_first)) - print ("The value of the second model is: {}" .format(model_second)) - print(" ") - -if __name__ == "__main__": - main() diff --git a/fast_io/model_checkpoint/deepspeed_save_model.py b/fast_io/model_checkpoint/deepspeed_save_model.py deleted file mode 100644 index ea97dd717..000000000 --- a/fast_io/model_checkpoint/deepspeed_save_model.py +++ /dev/null @@ -1,139 +0,0 @@ -import time -import torch -import os -import shutil -import gc -import random -import numpy as np -import deepspeed -from deepspeed.accelerator import get_accelerator -from save_model_utils import get_model, validate_arguments, parse_arguments - -def _get_ds_config(args, writer_type, use_gds): - ds_config = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": args.zero_stage, - "cpu_offload": args.cpu_offload - }, - "fp16": { - "enabled": args.half - }, - "optimizer": { - "type": "Adam", - "params": { - "torch_adam": not args.fused - } - }, - "checkpoint": { - "checkpoint_serialization": not args.legacy - }, - "aio": { - "block_size": 8 * (1024**2), - "queue_depth": 8, - "single_submit": False, - "overlap_events": True, - "intra_op_parallelism": 2, - "use_gds": use_gds, - } - } - - if writer_type: - ds_config["checkpoint"]["writer"] = { - "type": writer_type, - "io_buffer_size": args.io_buffer_mb * (1024**2), - "io_buffer_double": not args.single_io_buffer, - "show_statistics": not args.no_statistics, - "data_parallel": "socket" # None # not args.single_writer - } - - return ds_config - - -def _get_ds_engine(model, ds_config): - ds_engine, _, 
_, _ = deepspeed.initialize( - model=model, model_parameters=model.parameters(), config=ds_config) - - return ds_engine - - -def _do_optimizer_step(ds_engine): - for p in ds_engine.module.parameters(): - p.grad = torch.zeros_like(p) - ds_engine.step() - - -def _free_ds_memory(ds_engine): - ds_engine.optimizer.optimizer = None - ds_engine.optimizer = None - ds_engine.module = None - ds_engine = None - del ds_engine - gc.collect() - get_accelerator().empty_cache() - - -def test_save(tag, folder, model, args, writer_type): - use_gds = writer_type == 'fast' and 'gds' in tag - ds_config = _get_ds_config(args, writer_type, use_gds) - ds_engine = _get_ds_engine(model, ds_config) - if args.zero_stage == 0: - _do_optimizer_step(ds_engine) - - st = time.time() - ds_engine.save_checkpoint(save_dir=folder, tag=tag) - write_sec = time.time() - st - _free_ds_memory(ds_engine) - return write_sec - - -def _get_folder_size(folder): - size = 0 - for path, _, files in os.walk(folder): - size += sum([os.path.getsize(os.path.join(path, f)) for f in files]) - return size - - -def run(model, model_name, ckpt_name, args): - print(f'Model name = {model_name}') - writer_dict = { - 'test_save': None, - 'test_ds_mock_save': 'mock', - 'test_ds_py_save': 'python', - 'test_ds_aio_fast_save': 'fast', - 'test_ds_gds_fast_save': 'fast', - } - for tag, writer_type in writer_dict.items(): - folder = os.path.join(args.folder, ckpt_name, tag) - if os.path.exists(folder): - shutil.rmtree(folder, ignore_errors=True) - # if not os.path.exists(folder): - # os.makedirs(folder, exist_ok=True) - write_sec = test_save(tag, folder, model, args, writer_type) - ckpt_size = _get_folder_size(folder) - gb_size = ckpt_size / (1024**3) - gb_per_sec = gb_size / write_sec - print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' - ) - print(f'*********************************************') - - -def main(): - print( - f'Performance test of deepspeed integration of fast model checkpointing.' 
- ) - print(f'torch version = {torch.__version__}') - torch.manual_seed(42) - np.random.seed(0) - random.seed(0) - args = parse_arguments() - if not validate_arguments(args): - quit() - - model, model_name, ckpt_name = get_model(args.model) - run(model, model_name, ckpt_name, args) - - -if __name__ == "__main__": - main() diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt deleted file mode 100644 index 33985e8db..000000000 --- a/fast_io/model_checkpoint/log_9_21_22/gpt2-unfused.txt +++ /dev/null @@ -1,599 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) -Model name = gpt2-large -[2022-09-21 18:42:17,245] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:17,246] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
-[2022-09-21 18:42:18,108] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-21 18:42:18,109] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl -[2022-09-21 18:42:21,535] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -NCCL version 2.10.3+cuda11.3 -[2022-09-21 18:42:21,770] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:21,772] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:21,772] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:21,772] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:22,127] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:22,127] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:22,127] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_enabled ................. 
True -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:22,128] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_rounding ............ 
0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:42:22,129] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_config .................. 
{ - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:22,130] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:22,130] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... 
-Time to load utils op: 0.3399326801300049 seconds -[2022-09-21 18:42:23,204] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt -test_save -- 10.13 GB, 6.83 secs, 1.48 gb/s -********************************************* -[2022-09-21 18:42:30,157] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:30,164] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:30,277] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:30,278] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:30,278] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:30,278] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:30,656] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:30,656] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:30,656] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] aio_config ................... 
{'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:30,657] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_offset .............. 
1000 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:42:30,658] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:30,659] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:30,659] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "mock", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0004949569702148438 seconds -[2022-09-21 18:42:30,786] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -test_ds_mock_save -- 0.00 GB, 0.93 secs, 0.00 gb/s -********************************************* -[2022-09-21 18:42:32,824] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:32,831] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:32,926] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:32,927] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:32,927] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:32,927] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:33,248] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:33,248] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:42:33,248] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:33,248] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] elasticity_enabled ........... 
False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_groups .............. 
1 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:33,249] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:33,250] [INFO] [config.py:886:print] zero_optimization_stage ...... 0 -[2022-09-21 18:42:33,250] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "python", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.000392913818359375 seconds -[2022-09-21 18:42:33,377] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523621, 'write_secs': 5.274229288101196} -test_ds_py_save -- 10.13 GB, 6.32 secs, 1.60 gb/s -********************************************* -[2022-09-21 18:42:39,940] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:42:39,946] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:42:40,048] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -[2022-09-21 18:42:40,049] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:42:40,049] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = AdamW -[2022-09-21 18:42:40,049] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 unfused optimizer with dynamic loss scale -[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:42:40,439] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:42:40,439] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:42:40,440] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... -Building extension module async_io... -Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module async_io... -Time to load async_io op: 0.4869067668914795 seconds -[2022-09-21 18:42:41,329] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 
0 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:42:41,330] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_job_name ......... 
DeepSpeedJobName -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:42:41,331] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_config .................. { - "stage": 0, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_enabled ................. False -[2022-09-21 18:42:41,332] [INFO] [config.py:886:print] zero_optimization_stage ...... 
0 -[2022-09-21 18:42:41,332] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 0, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "fast", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0004849433898925781 seconds -[2022-09-21 18:42:41,458] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0003745555877685547 seconds -stats = {'close': 1, 'fileno': 2252, 'flush': 2, 'write': 4509, 'bytes': 10874523619, 'write_secs': 1.8456230163574219, 'aio_write_secs': 0.9408478736877441, 'aio_bytes': 10874523136, 'aio_gbs': 10.76442766994695, 'slow_bytes': 483, 'slow_write_secs': 0.0002315044403076172, 'fill_buffer_count': 4519, 'fill_buffer_secs': 0.9024286270141602, 'fill_buffer_speed': 11.22270347101499, 'save_storage': 0, 'save_storage_bytes': 0} -test_ds_fast_save -- 10.13 GB, 3.00 secs, 3.38 gb/s -********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt b/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt deleted file mode 100644 index 9871b634e..000000000 --- a/fast_io/model_checkpoint/log_9_21_22/gpt2_fused_z2.txt +++ /dev/null @@ -1,781 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=True, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=2) -Model name = gpt2-large -[2022-09-21 18:45:23,129] [INFO] [logging.py:60:log_dist] [Rank -1] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:45:23,130] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment... 
-[2022-09-21 18:45:23,991] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-21 18:45:23,991] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl -[2022-09-21 18:45:27,189] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -NCCL version 2.10.3+cuda11.3 -[2022-09-21 18:45:27,478] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Creating extension directory /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam... -Detected CUDA files, patching ldflags -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/fused_adam/build.ninja... -Building extension module fused_adam... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -[1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options 
'-fPIC' -lineinfo -O3 --use_fast_math -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -std=c++14 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o -[2/3] c++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -I/home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/includes -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -O3 -std=c++14 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -c /home/guanhuawang/DeepSpeed-internal/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o -[3/3] c++ fused_adam_frontend.o multi_tensor_adam.cuda.o -shared -L/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda_cu -ltorch_cuda_cpp -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o fused_adam.so -Loading extension module fused_adam... 
-Time to load fused_adam op: 19.252447843551636 seconds -[2022-09-21 18:45:47,263] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:45:47,263] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:45:47,263] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:45:47,263] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:45:47,263] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:45:47,263] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/utils/build.ninja... -Building extension module utils... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module utils... 
-Time to load utils op: 0.3341379165649414 seconds -[2022-09-21 18:45:47,651] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:45:47,652] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB -[2022-09-21 18:45:47,652] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.58 GB, percent = 1.3% -[2022-09-21 18:45:47,945] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:45:47,946] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.61 GB Max_CA 2 GB -[2022-09-21 18:45:47,946] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% -[2022-09-21 18:45:48,634] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:45:48,635] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,635] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% -[2022-09-21 18:45:48,681] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:45:48,682] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,682] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.53 GB, percent = 1.3% -[2022-09-21 18:45:48,733] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:45:48,734] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 3.06 GB Max_CA 3 GB -[2022-09-21 18:45:48,734] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.4 GB, percent = 1.3% -[2022-09-21 18:45:48,796] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:45:48,797] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 7.38 GB 
Max_CA 7 GB -[2022-09-21 18:45:48,797] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,848] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:45:48,849] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 7.38 GB Max_CA 7 GB -[2022-09-21 18:45:48,849] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,920] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:45:48,921] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 16.04 GB Max_CA 16 GB -[2022-09-21 18:45:48,921] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,921] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:45:48,968] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:45:48,969] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 16.04 GB Max_CA 16 GB -[2022-09-21 18:45:48,969] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:45:48,969] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:45:48,969] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:45:48,969] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': None} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] flops_profiler_config ........ 
{ - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:45:48,970] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_offset .............. 
1000 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:45:48,971] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:45:48,972] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:45:48,972] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0004029273986816406 seconds -[2022-09-21 18:45:49,143] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/mp_rank_00_model_states.pt -[2022-09-21 18:45:56,478] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_save/zero_to_fp32.py -[2022-09-21 18:45:56,479] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_save/test_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_save -- 10.13 GB, 7.51 secs, 1.35 gb/s -********************************************* -[2022-09-21 18:45:56,603] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:45:56,610] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:45:56,709] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... 
-Time to load fused_adam op: 0.0011363029479980469 seconds -[2022-09-21 18:45:56,771] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:45:56,771] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:45:56,771] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:45:56,771] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:45:56,771] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:45:56,771] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00023317337036132812 seconds -[2022-09-21 18:45:56,823] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:45:56,824] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 16 GB -[2022-09-21 18:45:56,824] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.55 GB, percent = 1.3% -[2022-09-21 18:45:57,123] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:45:57,124] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,124] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.54 GB, percent = 1.3% -[2022-09-21 18:45:57,614] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:45:57,615] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,616] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.51 GB, percent = 1.3% 
-[2022-09-21 18:45:57,661] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:45:57,662] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,662] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.52 GB, percent = 1.3% -[2022-09-21 18:45:57,713] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:45:57,714] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:45:57,714] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,775] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:45:57,775] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:45:57,776] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.41 GB, percent = 1.3% -[2022-09-21 18:45:57,827] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:45:57,828] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:45:57,828] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,887] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:45:57,887] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:45:57,888] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.38 GB, percent = 1.3% -[2022-09-21 18:45:57,888] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:45:57,933] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:45:57,934] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:45:57,934] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.37 GB, percent = 1.3% -[2022-09-21 18:45:57,934] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:45:57,935] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:45:57,935] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:45:57,935] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'MOCK', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... 
None -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:45:57,935] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... 
False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] sparse_gradients_enabled ..... 
False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:45:57,936] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:45:57,937] [INFO] [config.py:886:print] zero_optimization_stage ...... 
2 -[2022-09-21 18:45:57,937] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "mock", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.000377655029296875 seconds -[2022-09-21 18:45:57,942] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 0, 'save_storage': 0, 'save_storage_bytes': 0} -[2022-09-21 18:45:59,953] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/zero_to_fp32.py -[2022-09-21 18:45:59,953] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_mock_save/test_ds_mock_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_mock_save -- 0.00 GB, 2.02 secs, 0.00 gb/s -********************************************* -[2022-09-21 18:46:00,921] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 
18:46:00,928] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:46:01,026] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... -Time to load fused_adam op: 0.001192331314086914 seconds -[2022-09-21 18:46:01,079] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:46:01,079] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:46:01,079] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:46:01,079] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:46:01,080] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:46:01,080] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0002560615539550781 seconds -[2022-09-21 18:46:01,130] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:46:01,131] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB -[2022-09-21 18:46:01,132] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.63 GB, percent = 1.3% -[2022-09-21 18:46:01,426] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:46:01,427] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,427] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,861] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:46:01,862] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,863] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,907] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:46:01,908] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,908] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.56 GB, percent = 1.3% -[2022-09-21 18:46:01,959] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:46:01,960] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:01,960] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,013] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:46:02,013] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB 
Max_CA 6 GB -[2022-09-21 18:46:02,014] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,065] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:46:02,066] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:02,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,125] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:46:02,126] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:02,126] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,126] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:46:02,172] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:46:02,173] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:02,173] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.44 GB, percent = 1.3% -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:46:02,174] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:46:02,174] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -[2022-09-21 18:46:02,174] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - 
"synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] checkpoint_config ............ {'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'PYTHON', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:46:02,174] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] elasticity_enabled ........... 
False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_clipping ............ 0.0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_groups .............. 
1 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_type ................ 0 -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:46:02,175] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] world_size ................... 
1 -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_config .................. { - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:46:02,176] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:46:02,176] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "python", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... 
-Time to load utils op: 0.0003757476806640625 seconds -[2022-09-21 18:46:02,181] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/mp_rank_00_model_states.pt -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909547, 'write_secs': 0.7758586406707764} -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390323, 'write_secs': 4.455736398696899} -[2022-09-21 18:46:09,408] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/zero_to_fp32.py -[2022-09-21 18:46:09,409] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_py_save/test_ds_py_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_py_save -- 10.13 GB, 7.23 secs, 1.40 gb/s -********************************************* -[2022-09-21 18:46:09,498] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.4.1+a4269a63, git-hash=a4269a63, git-branch=guanhua/staging-fast-ckpt-v2 -[2022-09-21 18:46:09,504] [INFO] [utils.py:11:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1 -[2022-09-21 18:46:09,602] [INFO] [engine.py:176:__init__] DeepSpeed Flops Profiler Enabled: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module fused_adam, skipping build step... -Loading extension module fused_adam... 
-Time to load fused_adam op: 0.0010247230529785156 seconds -[2022-09-21 18:46:09,666] [INFO] [engine.py:706:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-21 18:46:09,666] [INFO] [engine.py:711:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam -Checking ZeRO support for optimizer=FusedAdam type= -[2022-09-21 18:46:09,666] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer -[2022-09-21 18:46:09,666] [INFO] [stage2.py:105:__init__] Reduce bucket size 500000000 -[2022-09-21 18:46:09,666] [INFO] [stage2.py:106:__init__] Allgather bucket size 500000000 -[2022-09-21 18:46:09,666] [INFO] [stage2.py:107:__init__] CPU Offload: False -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0002410411834716797 seconds -[2022-09-21 18:46:09,746] [INFO] [utils.py:588:see_memory_usage] Before moving param group 0 to CPU -[2022-09-21 18:46:09,747] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 10.13 GB CA 1.48 GB Max_CA 14 GB -[2022-09-21 18:46:09,747] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 22.6 GB, percent = 1.3% -[2022-09-21 18:46:10,065] [INFO] [utils.py:588:see_memory_usage] After moving param group 0 to CPU -[2022-09-21 18:46:10,066] [INFO] [utils.py:589:see_memory_usage] MA 0.04 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:10,066] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.59 GB, percent = 1.3% -[2022-09-21 18:46:11,872] [INFO] [utils.py:588:see_memory_usage] After flattening and moving param group 0 to GPU -[2022-09-21 18:46:11,873] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,873] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% 
-[2022-09-21 18:46:11,918] [INFO] [utils.py:588:see_memory_usage] After Flattening and after emptying param group 0 cache -[2022-09-21 18:46:11,919] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,919] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.58 GB, percent = 1.3% -[2022-09-21 18:46:11,969] [INFO] [utils.py:588:see_memory_usage] Before creating fp32 master weights for param group 0 -[2022-09-21 18:46:11,970] [INFO] [utils.py:589:see_memory_usage] MA 1.48 GB Max_MA 1.48 GB CA 1.48 GB Max_CA 1 GB -[2022-09-21 18:46:11,971] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,030] [INFO] [utils.py:588:see_memory_usage] After creating fp32 master weights for param group 0 -[2022-09-21 18:46:12,030] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 5.8 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:12,031] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,081] [INFO] [utils.py:588:see_memory_usage] Before initializing optimizer states -[2022-09-21 18:46:12,082] [INFO] [utils.py:589:see_memory_usage] MA 4.36 GB Max_MA 4.36 GB CA 5.81 GB Max_CA 6 GB -[2022-09-21 18:46:12,082] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,141] [INFO] [utils.py:588:see_memory_usage] After initializing optimizer states -[2022-09-21 18:46:12,142] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB Max_MA 13.01 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:12,142] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,142] [INFO] [stage2.py:415:__init__] optimizer state initialized -[2022-09-21 18:46:12,188] [INFO] [utils.py:588:see_memory_usage] After initializing ZeRO optimizer -[2022-09-21 18:46:12,188] [INFO] [utils.py:589:see_memory_usage] MA 10.13 GB 
Max_MA 10.13 GB CA 14.46 GB Max_CA 14 GB -[2022-09-21 18:46:12,189] [INFO] [utils.py:597:see_memory_usage] CPU Virtual Memory: used = 23.46 GB, percent = 1.3% -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed Final Optimizer = adam -[2022-09-21 18:46:12,189] [INFO] [engine.py:524:_configure_lr_scheduler] DeepSpeed using client LR scheduler -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed LR Scheduler = None -[2022-09-21 18:46:12,189] [INFO] [logging.py:60:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001], mom=[(0.9, 0.999)] -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -Emitting ninja build file /home/guanhuawang/.cache/torch_extensions/py38_cu113/async_io/build.ninja... -Building extension module async_io... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module async_io... -Time to load async_io op: 0.5492517948150635 seconds -[2022-09-21 18:46:13,140] [INFO] [config.py:882:print] DeepSpeedEngine configuration: -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] activation_checkpointing_config { - "partition_activations": false, - "contiguous_memory_optimization": false, - "cpu_checkpointing": false, - "number_checkpoints": null, - "synchronize_checkpoint_boundary": false, - "profile": false -} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] aio_config ................... {'block_size': 8388608, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': False} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] allreduce_always_fp32 ........ False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_enabled .................. False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] amp_params ................... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] checkpoint_config ............ 
{'tag_validation': 'WARN', 'checkpoint_serialization': False, 'writer': {'type': 'FAST', 'io_buffer_size': 1073741824, 'io_buffer_double': False, 'show_statistics': True}} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] disable_allgather ............ False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dump_state ................... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] dynamic_loss_scale_args ...... None -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_enabled ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_gas_boundary_resolution 1 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_name ........ bert.encoder.layer -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_layer_num ......... 0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_max_iter .......... 100 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_stability ......... 1e-06 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_tol ............... 0.01 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] eigenvalue_verbose ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] elasticity_enabled ........... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] flops_profiler_config ........ { - "enabled": false, - "profile_step": 1, - "module_depth": -1, - "top_modules": 1, - "detailed": true, - "output_file": null -} -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_enabled ................. True -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] fp16_mixed_quantize .......... False -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] global_rank .................. 0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_accumulation_steps .. 1 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_clipping ............ 
0.0 -[2022-09-21 18:46:13,141] [INFO] [config.py:886:print] gradient_predivide_factor .... 1.0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] initial_dynamic_scale ........ 4294967296 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] loss_scale ................... 0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] memory_breakdown ............. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_legacy_fusion ...... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_name ............... adam -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] optimizer_params ............. {} -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_enabled .................. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] pld_params ................... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] prescale_gradients ........... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_change_rate ......... 0.001 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_groups .............. 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_offset .............. 1000 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_period .............. 1000 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_rounding ............ 0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_start_bits .......... 16 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_target_bits ......... 8 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_training_enabled .... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_type ................ 
0 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] quantize_verbose ............. False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_name ............... None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] scheduler_params ............. None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_attention ............. None -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] sparse_gradients_enabled ..... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] steps_per_print .............. 10 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_enabled .......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_job_name ......... DeepSpeedJobName -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] tensorboard_output_path ...... -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_batch_size ............. 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] train_micro_batch_size_per_gpu 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] use_quantizer_kernel ......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] wall_clock_breakdown ......... False -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] world_size ................... 1 -[2022-09-21 18:46:13,142] [INFO] [config.py:886:print] zero_allow_untested_optimizer False -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_config .................. 
{ - "stage": 2, - "contiguous_gradients": false, - "reduce_scatter": true, - "reduce_bucket_size": 5.000000e+08, - "allgather_partitions": true, - "allgather_bucket_size": 5.000000e+08, - "overlap_comm": false, - "load_from_fp32_weights": true, - "elastic_checkpoint": true, - "offload_param": null, - "offload_optimizer": null, - "sub_group_size": 1.000000e+12, - "prefetch_bucket_size": 5.000000e+07, - "param_persistence_threshold": 1.000000e+05, - "max_live_parameters": 1.000000e+09, - "max_reuse_distance": 1.000000e+09, - "gather_fp16_weights_on_model_save": false, - "ignore_unused_parameters": true, - "legacy_stage1": false -} -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_enabled ................. True -[2022-09-21 18:46:13,143] [INFO] [config.py:886:print] zero_optimization_stage ...... 2 -[2022-09-21 18:46:13,143] [INFO] [config.py:888:print] json = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": false - }, - "fp16": { - "enabled": true - }, - "optimizer": { - "type": "Adam", - "params": { - } - }, - "checkpoint": { - "checkpoint_serialization": false, - "writer": { - "type": "fast", - "io_buffer_size": 1.073742e+09, - "io_buffer_double": false, - "show_statistics": true - } - }, - "aio": { - "block_size": 8.388608e+06, - "queue_depth": 8, - "single_submit": false, - "overlap_events": false, - "thread_count": 1 - } -} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.00046539306640625 seconds -[2022-09-21 18:46:13,149] [INFO] [logging.py:60:log_dist] [Rank 0] Saving model checkpoint: /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/mp_rank_00_model_states.pt -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... 
-No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0002307891845703125 seconds -stats = {'close': 1, 'fileno': 73, 'flush': 2, 'write': 152, 'bytes': 1585909545, 'write_secs': 0.4641237258911133, 'aio_write_secs': 0.17467093467712402, 'aio_bytes': 1585909248, 'aio_gbs': 8.455860654115417, 'slow_bytes': 297, 'slow_write_secs': 0.00024700164794921875, 'fill_buffer_count': 153, 'fill_buffer_secs': 0.3299696445465088, 'fill_buffer_speed': 4.476148362022062, 'save_storage': 0, 'save_storage_bytes': 0} -Using /home/guanhuawang/.cache/torch_extensions/py38_cu113 as PyTorch extensions root... -No modifications detected for re-loaded extension module utils, skipping build step... -Loading extension module utils... -Time to load utils op: 0.0003643035888671875 seconds -stats = {'close': 1, 'fileno': 3, 'flush': 2, 'write': 17, 'bytes': 9288390321, 'write_secs': 1.366792917251587, 'aio_write_secs': 0.8517467975616455, 'aio_bytes': 9288390144, 'aio_gbs': 10.156172524167351, 'slow_bytes': 177, 'slow_write_secs': 0.0003936290740966797, 'fill_buffer_count': 25, 'fill_buffer_secs': 0.5708425045013428, 'fill_buffer_speed': 15.153895084423882, 'save_storage': 0, 'save_storage_bytes': 0} -[2022-09-21 18:46:17,080] [INFO] [engine.py:1961:_copy_recovery_script] creating recovery script /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/zero_to_fp32.py -[2022-09-21 18:46:17,080] [INFO] [engine.py:1975:_save_zero_checkpoint] zero checkpoint saved /home/guanhuawang/eclipse/gpt2-large/test_ds_fast_save/test_ds_fast_save/zero_pp_rank_0_mp_rank_00_optim_states.pt -test_ds_fast_save -- 10.13 GB, 3.94 secs, 2.57 gb/s -********************************************* diff --git a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt b/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt deleted file mode 100644 index 5a5292f6e..000000000 --- 
a/fast_io/model_checkpoint/log_9_21_22/torch_star_half_error.txt +++ /dev/null @@ -1,72 +0,0 @@ -Performance test of deepspeed integration of fast model checkpointing. -torch version = 1.12.0+cu113 -args = Namespace(cpu_offload=False, folder='/home/guanhuawang/eclipse', fused=False, gpu=False, half=True, io_buffer_mb=1024, legacy=True, model='gpt2-large', no_statistics=False, optimizer=False, single_io_buffer=True, zero_stage=0) -Model name = gpt2-large -[2022-09-22 01:29:33,721] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.4+74104af1, git-hash=74104af1, git-branch=staging-fast-model-checkpoint-v3 -[2022-09-22 01:29:33,725] [INFO] [comm.py:617:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment... --------------------------------------------------------------------------- -WARNING: No preset parameters were found for the device that Open MPI -detected: - - Local host: azwuse57c00009D - Device name: mlx5_ib0 - Device vendor ID: 0x02c9 - Device vendor part ID: 4124 - -Default device parameters will be used, which may result in lower -performance. You can edit any of the files specified by the -btl_openib_device_param_files MCA parameter to set values for your -device. - -NOTE: You can turn off this warning by setting the MCA parameter - btl_openib_warn_no_device_params_found to 0. --------------------------------------------------------------------------- --------------------------------------------------------------------------- -By default, for Open MPI 4.0 and later, infiniband ports on a device -are not used by default. The intent is to use UCX for these devices. -You can override this policy by setting the btl_openib_allow_ib MCA parameter -to true. 
- - Local host: azwuse57c00009D - Local adapter: mlx5_ib0 - Local port: 1 - --------------------------------------------------------------------------- --------------------------------------------------------------------------- -WARNING: There was an error initializing an OpenFabrics device. - - Local host: azwuse57c00009D - Local device: mlx5_ib4 --------------------------------------------------------------------------- -[2022-09-22 01:29:34,587] [INFO] [comm.py:669:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=192.168.1.46, master_port=29500 -[2022-09-22 01:29:34,587] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl -[2022-09-22 01:29:34,591] [WARNING] [config_utils.py:63:_process_deprecated_field] Config parameter cpu_offload is deprecated use offload_optimizer instead -NCCL version 2.10.3+cuda11.3 -[2022-09-22 01:29:38,429] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False -[2022-09-22 01:29:38,430] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer -[2022-09-22 01:29:38,461] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = {basic_optimizer.__class__.__name__} -Traceback (most recent call last): - File "deepspeed_save_model.py", line 133, in - main() - File "deepspeed_save_model.py", line 129, in main - run(model, model_name, ckpt_name, args) - File "deepspeed_save_model.py", line 106, in run - write_sec = test_save(tag, folder, model, args, writer_type) - File "deepspeed_save_model.py", line 76, in test_save - ds_engine = _get_ds_engine(model, ds_config) - File "deepspeed_save_model.py", line 52, in _get_ds_engine - ds_engine, _, _, _ = deepspeed.initialize( - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/__init__.py", line 124, in initialize - engine = DeepSpeedEngine(args=args, - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 
322, in __init__ - self._configure_optimizer(optimizer, model_parameters) - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1178, in _configure_optimizer - self.optimizer = self._configure_fp16_optimizer(basic_optimizer) - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 1314, in _configure_fp16_optimizer - or self.fp16_fused_mode() \ - File "/home/guanhuawang/DeepSpeed-internal/deepspeed/runtime/engine.py", line 792, in fp16_fused_mode - return self._config.fp16_fused_mode -AttributeError: 'DeepSpeedConfig' object has no attribute 'fp16_fused_mode' -[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / no device params found -[azwuse57c00009D:37114] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages -[azwuse57c00009D:37114] 4 more processes have sent help message help-mpi-btl-openib.txt / ib port not selected diff --git a/fast_io/model_checkpoint/requirements.txt b/fast_io/model_checkpoint/requirements.txt deleted file mode 100644 index 976a2b1f3..000000000 --- a/fast_io/model_checkpoint/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -transformers diff --git a/fast_io/model_checkpoint/save_model_utils.py b/fast_io/model_checkpoint/save_model_utils.py deleted file mode 100644 index faf4fc5d8..000000000 --- a/fast_io/model_checkpoint/save_model_utils.py +++ /dev/null @@ -1,116 +0,0 @@ -import argparse -import os -from transformers import AutoModelForCausalLM -from transformers import T5ForConditionalGeneration -from torch_save_utils import PINNED_BUFFER_MB - - -GPT2L = 'gpt2-large' -TINY_T5 = 'tiny-t5' -PHI3_MINI = 'phi3' -PHI3_VISION = 'phi3-v' -LLAMA3_1B = 'llama3-1B' - -HF_MODELS_DICT = { - TINY_T5: "hf-internal-testing/tiny-random-t5", - GPT2L: GPT2L, - PHI3_MINI: "microsoft/Phi-3.5-mini-instruct", - PHI3_VISION: "microsoft/Phi-3.5-vision-instruct", - LLAMA3_1B: "meta-llama/Llama-3.2-1B", -} - -def _get_hf_model(tag): - model_name = 
HF_MODELS_DICT[tag] - if tag == TINY_T5: - model = T5ForConditionalGeneration.from_pretrained(model_name) - else: - model = AutoModelForCausalLM.from_pretrained(model_name) - - return model, model_name, tag - -def get_model(model_tag): - return _get_hf_model(model_tag) - - -def validate_arguments(args): - success = True - - if not args.model in HF_MODELS_DICT: - print(f'{args.model} is not a supported HF model tag') - success = False - - if args.optimizer and args.half: - if not args.gpu: - print(f'mixed precision only supported with gpu tensors') - success = False - - return success - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--folder', - default=None, - type=str, - required=True, - help='Folder to use for I/O.') - - parser.add_argument( - '--model', - default=None, - type=str, - required=True, - help=f'HuggingFace tag of model. Available models = {list(HF_MODELS_DICT.keys())}') - - parser.add_argument('--local_rank', - type=int, - default=0, - help='Local rank' ) - - parser.add_argument('--legacy', - action='store_true', - help='Use torch legacy save format') - - parser.add_argument('--optimizer', - action='store_true', - help='Include optimizer state in checkpoint.') - - parser.add_argument('--fused', - action='store_true', - help='Use fused fp16 optimizer.') - - parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') - - parser.add_argument('--half', - action='store_true', - help='Use half-precision tensors.') - - parser.add_argument( - '--io_buffer_mb', - type=int, - default=PINNED_BUFFER_MB, - help=f'Size of pinned i/o buffer in MB. Default = {PINNED_BUFFER_MB}') - - parser.add_argument('--zero_stage', - type=int, - default=0, - help='ZeRO optimization stage. 
Default = 0') - - parser.add_argument('--cpu_offload', - action='store_true', - help='Enable CPU offload of optimizer state.') - - parser.add_argument('--no-statistics', - action='store_true', - help='Suppress low-level performance statistics.') - - parser.add_argument('--single_io_buffer', - action='store_true', - help='Disable double buffering of i/o buffer.') - - - #parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state') - - args = parser.parse_args() - print(f'args = {args}') - return args diff --git a/fast_io/model_checkpoint/torch_save_model.py b/fast_io/model_checkpoint/torch_save_model.py deleted file mode 100644 index 6c1103049..000000000 --- a/fast_io/model_checkpoint/torch_save_model.py +++ /dev/null @@ -1,76 +0,0 @@ -import time -import torch -from torch.optim import Adam -import os -from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save -from save_model_utils import get_model, validate_arguments, parse_arguments -import deepspeed -from deepspeed.accelerator import get_accelerator - - -def run(model, model_name, ckpt_name, args): - print(f'Model name = {model_name}') - fn_dict = { - 'test_save': test_save, - 'test_ds_mock_save': test_ds_mock_save, - 'test_ds_py_save': test_ds_py_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save, - 'test_ds_aio_fast_save': test_ds_aio_fast_save, - } - for tag, fn in fn_dict.items(): - if tag == 'test_ds_gds_fast_save' and not args.gpu: - continue - file = os.path.join(args.folder, f'{tag}_{ckpt_name}.pt') - print(f'checkpoint file = {file}') - if os.path.isfile(file): - os.remove(file) - st = time.time() - write_sec = fn(file, model, args) - ckpt_size = os.path.getsize(file) - gb_size = ckpt_size / (1024**3) - gb_per_sec = gb_size / write_sec - print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' - ) - 
print(f'*********************************************') - - -def _get_initialized_optimizer(model, fused_opt): - base_optimizer = Adam(model.parameters()) - if fused_opt: - from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer as FP16_Wrapper - else: - from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer as FP16_Wrapper - optimizer = FP16_Wrapper(base_optimizer) - for p in model.parameters(): - p.grad = torch.zeros_like(p) - optimizer.step() - return optimizer - - -def main(): - print( - f'Performance test of torch.save() integration of fast model checkpointing.' - ) - print(f'torch version = {torch.__version__}') - torch.manual_seed(42) - - args = parse_arguments() - if not validate_arguments(args): - quit() - - model, model_name, ckpt_name = get_model(args.model) - if args.half: - model = model.half() - if args.gpu: - model = model.to(get_accelerator().current_device_name()) - if args.optimizer: - optimizer = _get_initialized_optimizer(model, args.fused) - ckpt_state = {'model': model, 'optimizer': optimizer} - else: - ckpt_state = {'model': model} - run(ckpt_state, model_name, ckpt_name, args) - - -if __name__ == "__main__": - main() diff --git a/fast_io/model_checkpoint/torch_save_tensor.py b/fast_io/model_checkpoint/torch_save_tensor.py deleted file mode 100644 index 014fdd035..000000000 --- a/fast_io/model_checkpoint/torch_save_tensor.py +++ /dev/null @@ -1,95 +0,0 @@ -import time -import argparse -import torch -import os -from torch_save_utils import PINNED_BUFFER_MB -from torch_save_utils import test_save, test_ds_mock_save, test_ds_py_save, test_ds_aio_fast_save, test_ds_gds_fast_save -import deepspeed -from deepspeed.accelerator import get_accelerator - - -def run(args): - device = get_accelerator().current_device_name() if args.gpu else 'cpu' - buffer = torch.randint(high=128, - size=(args.mb_size * (1024**2), ), - dtype=torch.uint8, - device=device) - - fn_dict = { - # 'test_save': test_save, - # 'test_ds_mock_save': 
test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - 'test_ds_aio_fast_save': test_ds_aio_fast_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save - } - for tag, fn in fn_dict.items(): - if tag == 'test_ds_gds_fast_save' and not args.gpu: - continue - file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') - print(f'checkpoint file = {file}') - if os.path.isfile(file): - os.remove(file) - st = time.time() - write_sec = fn(file, buffer, args) - gb_per_sec = args.mb_size / (1024.0 * write_sec) - gb_size = os.path.getsize(file) / (1024**3) - print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' - ) - print(f'*********************************************') - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--folder', - default=None, - type=str, - required=True, - help='Folder to use for I/O.') - parser.add_argument('--mb_size', - type=int, - default=None, - required=True, - help='Size of tensor to save in MB.') - parser.add_argument('--legacy', - action='store_true', - help='Use torch legacy save format') - - parser.add_argument('--gpu', action='store_true', help='Use gpu tensors.') - - parser.add_argument('--io_buffer_mb', - type=int, - default=PINNED_BUFFER_MB, - help='Size of pinned i/o buffer in MB.') - - parser.add_argument('--no-statistics', - action='store_true', - help='Suppress low-level performance statistics.') - - parser.add_argument('--single_io_buffer', - action='store_true', - help='Disable double buffering of i/o buffer.') - parser.add_argument('--local_rank', - type=int, - default=0, - help='Local rank' ) - - args = parser.parse_args() - print(f'args = {args}') - return args - - -def main(): - print( - f'Performance test of torch.save() integration of fast tensor checkpointing.' 
- ) - args = parse_arguments() - if not os.path.exists(args.folder): - print(f'Invalid folder: {args.folder}') - quit() - deepspeed.init_distributed() - run(args) - - -if __name__ == "__main__": - main() diff --git a/fast_io/model_checkpoint/torch_save_utils.py b/fast_io/model_checkpoint/torch_save_utils.py deleted file mode 100644 index cf5f2bba5..000000000 --- a/fast_io/model_checkpoint/torch_save_utils.py +++ /dev/null @@ -1,111 +0,0 @@ -import time -import torch -import os -import deepspeed -from deepspeed.ops.op_builder import AsyncIOBuilder, GDSBuilder -from deepspeed.io import MockFileWriter, PyFileWriter, FastFileWriter, FastFileWriterConfig -from deepspeed.accelerator import get_accelerator - -AIO_QUEUE_DEPTH = 8 -AIO_BLOCK_SIZE = 8 * (1024**2) -AIO_INTRA_OP_PARALLEL = 1 -AIO_SINGLE_SUBMIT = False -AIO_OVERLAP_EVENTS = False -PINNED_BUFFER_MB = 64 - - -def _get_aio_handle(): - h = AsyncIOBuilder().load(verbose=False).aio_handle(block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - intra_op_parallelism=AIO_INTRA_OP_PARALLEL) - return h - -def _get_gds_handle(): - h = GDSBuilder().load(verbose=False).gds_handle(block_size=AIO_BLOCK_SIZE, - queue_depth=AIO_QUEUE_DEPTH, - single_submit=AIO_SINGLE_SUBMIT, - overlap_events=AIO_SINGLE_SUBMIT, - intra_op_parallelism=AIO_INTRA_OP_PARALLEL) - return h - -def test_save(file, buffer, args): - st = time.time() - torch.save(f=file, - obj=buffer, - _use_new_zipfile_serialization=not args.legacy) - return time.time() - st - - -def test_ds_mock_save(file, buffer, args): - st = time.time() - ds_mock_writer = MockFileWriter(file) - torch.save(f=ds_mock_writer, - obj=buffer, - _use_new_zipfile_serialization=not args.legacy) - ds_mock_writer.close() # Force flush to storage - write_sec = time.time() - st - if not args.no_statistics: - ds_mock_writer._dump_state() - return write_sec - - -def test_ds_py_save(file, buffer, args): - st = time.time() - 
ds_py_writer = PyFileWriter(file) - torch.save(f=ds_py_writer, - obj=buffer, - _use_new_zipfile_serialization=not args.legacy) - ds_py_writer.close() # Force flush to storage - write_sec = time.time() - st - if not args.no_statistics: - ds_py_writer._dump_state() - return write_sec - -def _get_aio_components(args): - h = _get_aio_handle() - pinned_memory = torch.zeros(args.io_buffer_mb * (1024**2), - dtype=torch.uint8, - device='cpu').pin_memory() - return h, pinned_memory - -def _get_gds_components(args): - h = _get_gds_handle() - pinned_memory = torch.empty(args.io_buffer_mb * (1024**2), - dtype=torch.uint8, - device=get_accelerator().current_device_name()) - h.pin_device_tensor(pinned_memory) - return h, pinned_memory - - - -def _test_ds_fast_save(file, buffer, args, use_gds): - if use_gds: - h, pinned_memory = _get_gds_components(args) - else: - h, pinned_memory = _get_aio_components(args) - st = time.time() - fast_writer_config = FastFileWriterConfig(dnvme_handle=h, - pinned_tensor=pinned_memory, - double_buffer=not args.single_io_buffer, - num_parallel_writers=1, - writer_rank=0) - - ds_fast_writer = FastFileWriter(file_path=file, - config=fast_writer_config) - torch.save(f=ds_fast_writer, - obj=buffer, - _use_new_zipfile_serialization=not args.legacy) - ds_fast_writer.close() # Force flush to storage - write_sec = time.time() - st - if not args.no_statistics: - ds_fast_writer._dump_state() - return write_sec - - -def test_ds_aio_fast_save(file, buffer, args): - return _test_ds_fast_save(file, buffer, args, False) - -def test_ds_gds_fast_save(file, buffer, args): - return _test_ds_fast_save(file, buffer, args, True) From 1c3a54c8da1ae3f1abf5f85cf015cf5179b5cd62 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 26 Mar 2025 13:35:41 -0400 Subject: [PATCH 32/40] More cleanup Signed-off-by: Olatunji Ruwase --- .../model_checkpoint/checkpoint_compare.py | 123 ------------------ deepnvme/model_checkpoint/torch_save_model.py | 3 + 
.../model_checkpoint/torch_save_tensor.py | 45 +------ 3 files changed, 5 insertions(+), 166 deletions(-) delete mode 100644 deepnvme/model_checkpoint/checkpoint_compare.py diff --git a/deepnvme/model_checkpoint/checkpoint_compare.py b/deepnvme/model_checkpoint/checkpoint_compare.py deleted file mode 100644 index cc67b61d9..000000000 --- a/deepnvme/model_checkpoint/checkpoint_compare.py +++ /dev/null @@ -1,123 +0,0 @@ -#This script is for testing whether two checkpoints match; it prints all the differences - -import torch -import os -import sys -import pickle -from collections import OrderedDict - -exclude_key_str = {'ds_config/checkpoint/writer'} - -def main(): - dir1 = sys.argv[1] - dir2 = sys.argv[2] - print ("Begin comparison") - print ("The first directory {}" .format(dir1)) - print ("The second directory {}" .format(dir2)) - print (' ') - - file_list1 = [f for f in os.listdir(dir1) if os.path.isfile(os.path.join(dir1, f))] - file_list2 = [f for f in os.listdir(dir2) if os.path.isfile(os.path.join(dir2, f))] - common_files = [] - - for f in file_list1: - if not (f in file_list2): - log_error_file_mismatch_first(f) - else: - common_files.append(f) - for f in file_list2: - if not (f in file_list1): - log_error_file_mismatch_second(f) - - for f in common_files: - full_dir1 = os.path.join(dir1, f) - full_dir2 = os.path.join(dir2, f) - print ("Begin comparison") - print("The first checkpoint {}" .format(full_dir1)) - print("The second checkpoint {}" .format(full_dir2)) - print(' ') - model_first = torch.load(full_dir1) - model_second = torch.load(full_dir2) - object_compare(model_first, model_second, []) - - -def object_compare(model_first, model_second, key_chain): - if not (type(model_first) == type(model_second)): - log_error_value_mismatch(model_first, model_second, key_chain) - return - - if type(model_first) is list: - if len(model_first) != len(model_second): - log_error_value_mismatch(model_first, model_second, key_chain) - return - for i in 
range(len(model_first)): - object_compare(model_first[i], model_second[i], key_chain) - return - - if type(model_first) is dict or type(model_first) is OrderedDict: - common_keys = [] - for key in model_first: - if key not in model_second: - key_chain.append(key) - log_error_key_mismatch_first(model_first[key], key_chain) - key_chain.pop() - else: - common_keys.append(key) - - for key in model_second: - if key not in model_first: - key_chain.append(key) - log_error_key_mismatch_second(model_second[key], key_chain) - key_chain.pop() - - for key in common_keys: - key_chain.append(key) - object_compare(model_first[key], model_second[key], key_chain) - key_chain.pop() - return - - if hasattr(model_first, '__dict__'): - equality = (model_first.__dict__ == model_second.__dict__) - else: - equality = (model_first == model_second) - if type(equality) is not bool: - equality = (equality.all()) - if not equality: - log_error_value_mismatch(model_first, model_second, key_chain) - return - - -def log_error_file_mismatch_first(filename): - print("The following file appeared in the first but not the second directory: {}" .format(filename)) - print(' ') - - -def log_error_file_mismatch_second(filename): - print("The following key appeared in the second but not the first directory: {}" .format(filename)) - print(" ") - - -def log_error_key_mismatch_first(model, key_chain): - key_str = "/".join(key_chain) - if not (key_str in exclude_key_str): - print("The following key appeared in the first but not the second model: {}" .format(key_str)) - print("The value of the first model is: {}" .format(model)) - print(" ") - - -def log_error_key_mismatch_second(model, key_chain): - key_str = "/".join(key_chain) - if not (key_str in exclude_key_str): - print("The following key appeared in the second but not the first model: {}" .format(key_str)) - print("The value of the second model is: {}" .format(model)) - print(" ") - - -def log_error_value_mismatch(model_first, model_second, key_chain): - 
print ("The values of the following key do not match: {}" .format("/".join(key_chain))) - print ("The value of the first model is: {}" .format(model_first)) - print ("The value of the second model is: {}" .format(model_second)) - print(" ") - -if __name__ == "__main__": - main() diff --git a/deepnvme/model_checkpoint/torch_save_model.py b/deepnvme/model_checkpoint/torch_save_model.py index 6c1103049..f37d122be 100644 --- a/deepnvme/model_checkpoint/torch_save_model.py +++ b/deepnvme/model_checkpoint/torch_save_model.py @@ -6,6 +6,7 @@ from save_model_utils import get_model, validate_arguments, parse_arguments import deepspeed from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist def run(model, model_name, ckpt_name, args): @@ -59,6 +60,7 @@ def main(): if not validate_arguments(args): quit() + deepspeed.init_distributed() model, model_name, ckpt_name = get_model(args.model) if args.half: model = model.half() @@ -70,6 +72,7 @@ def main(): else: ckpt_state = {'model': model} run(ckpt_state, model_name, ckpt_name, args) + dist.destroy_process_group() if __name__ == "__main__": diff --git a/deepnvme/model_checkpoint/torch_save_tensor.py b/deepnvme/model_checkpoint/torch_save_tensor.py index 381deebde..4c73a3b2a 100644 --- a/deepnvme/model_checkpoint/torch_save_tensor.py +++ b/deepnvme/model_checkpoint/torch_save_tensor.py @@ -7,50 +7,9 @@ import deepspeed from deepspeed.accelerator import get_accelerator import deepspeed.comm as dist -import multiprocessing as mp import os -FUNC_DICT = { - # 'test_save': test_save, - # 'test_ds_mock_save': test_ds_mock_save, - # 'test_ds_py_save': test_ds_py_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save, - # 'test_ds_aio_fast_save': test_ds_aio_fast_save, -} - def run(args): - - for tag, fn in FUNC_DICT.items(): - if tag == 'test_ds_gds_fast_save' and not args.gpu: - continue - print(f"launching {tag=} from {os.getpid()=}") - mp.set_start_method('spawn', force=True) - run_save_method(tag, args) - - 
-def run_save_method(tag, args): - print(f"running {tag=} from {os.getpid()=}") - device = get_accelerator().current_device_name() if args.gpu else 'cpu' - buffer = torch.randint(high=128, - size=(args.mb_size * (1024**2), ), - dtype=torch.uint8, - device=device) - - file = os.path.join(args.folder, f'{tag}_{args.mb_size}MB.pt') - print(f'checkpoint file = {file}') - if os.path.isfile(file): - os.remove(file) - st = time.time() - write_sec = FUNC_DICT[tag](file, buffer, args) - gb_per_sec = args.mb_size / (1024.0 * write_sec) - gb_size = os.path.getsize(file) / (1024**3) - print( - f'{tag} -- {gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s' - ) - print(f'*********************************************') - - -def old_run(args): device = get_accelerator().current_device_name() if args.gpu else 'cpu' buffer = torch.randint(high=128, size=(args.mb_size * (1024**2), ), @@ -61,8 +20,8 @@ def old_run(args): 'test_save': test_save, 'test_ds_mock_save': test_ds_mock_save, 'test_ds_py_save': test_ds_py_save, - # 'test_ds_aio_fast_save': test_ds_aio_fast_save, - 'test_ds_gds_fast_save': test_ds_gds_fast_save + 'test_ds_gds_fast_save': test_ds_gds_fast_save, + 'test_ds_aio_fast_save': test_ds_aio_fast_save, } for tag, fn in fn_dict.items(): if tag == 'test_ds_gds_fast_save' and not args.gpu: From 9a8540b4414c04e8f1c952ac3f93f8a8ef831ce5 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 27 Mar 2025 12:15:07 -0400 Subject: [PATCH 33/40] torch changes Signed-off-by: Olatunji Ruwase --- .../torch/serialization_fast_v2.6.0.py | 1979 +++++++++++++++++ .../torch/serialization_orig_v2.6.0.py | 1975 ++++++++++++++++ 2 files changed, 3954 insertions(+) create mode 100644 deepnvme/model_checkpoint/torch/serialization_fast_v2.6.0.py create mode 100644 deepnvme/model_checkpoint/torch/serialization_orig_v2.6.0.py diff --git a/deepnvme/model_checkpoint/torch/serialization_fast_v2.6.0.py b/deepnvme/model_checkpoint/torch/serialization_fast_v2.6.0.py new file mode 100644 
index 000000000..27b90f0b8 --- /dev/null +++ b/deepnvme/model_checkpoint/torch/serialization_fast_v2.6.0.py @@ -0,0 +1,1979 @@ +# mypy: allow-untyped-defs +import copyreg +import difflib +import functools +import io +import os +import pickle +import re +import shutil +import struct +import sys +import tarfile +import tempfile +import threading +import warnings +from contextlib import closing, contextmanager +from enum import Enum +from typing import ( + Any, + BinaryIO, + Callable, + cast, + Dict, + IO, + List, + Optional, + Tuple, + Type, + Union, +) +from typing_extensions import TypeAlias, TypeIs + +import torch +import torch._weights_only_unpickler as _weights_only_unpickler +from torch._sources import get_source_lines_and_file +from torch._utils import _import_dotted_name +from torch.storage import _get_dtype_from_pickle_storage_type +from torch.types import Storage + + +__all__ = [ + "SourceChangeWarning", + "mkdtemp", + "register_package", + "check_module_version_greater_or_equal", + "validate_cuda_device", + "validate_hpu_device", + "location_tag", + "default_restore_location", + "normalize_storage_type", + "storage_to_tensor_type", + "save", + "load", + "StorageType", + "LoadEndianness", + "get_crc32_options", + "set_crc32_options", + "get_default_load_endianness", + "set_default_load_endianness", + "get_default_mmap_options", + "set_default_mmap_options", + "clear_safe_globals", + "get_safe_globals", + "add_safe_globals", + "safe_globals", + "get_unsafe_globals_in_checkpoint", + "skip_data", +] + +DEFAULT_PROTOCOL = 2 + +LONG_SIZE = struct.Struct("=l").size +INT_SIZE = struct.Struct("=i").size +SHORT_SIZE = struct.Struct("=h").size + +MAGIC_NUMBER = 0x1950A86A20F9469CFC6C +PROTOCOL_VERSION = 1001 +STORAGE_KEY_SEPARATOR = "," + +FILE_LIKE: TypeAlias = Union[str, os.PathLike, BinaryIO, IO[bytes]] +MAP_LOCATION: TypeAlias = Optional[ + Union[Callable[[Storage, str], Storage], torch.device, str, Dict[str, str]] +] +STORAGE: TypeAlias = Union[Storage, 
torch.storage.TypedStorage, torch.UntypedStorage] + +IS_WINDOWS = sys.platform == "win32" + +UNSAFE_MESSAGE = ( + "In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` " + "from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, " + "but it can result in arbitrary code execution. Do it only if you got the file from a " + "trusted source." +) + +if not IS_WINDOWS: + from mmap import MAP_PRIVATE, MAP_SHARED +else: + MAP_SHARED, MAP_PRIVATE = None, None # type: ignore[assignment] + + +def _default_to_weights_only(pickle_module): + is_fbcode = not hasattr(torch.version, "git_version") + return pickle_module is None and not is_fbcode + + +# _serialization_tls is used to store thread local state specific to serialization +# that needs to be propagated to other files, in particular we use this for +# (1) map_location (needed for wrapper subclasses/third party devices to torch._utils) +# (2) skip_data (needed for torch.Tensor.__reduce_ex__ for skip_data ctx) +# (3) materialize_fake_tensors (needed for torch.Tensor.__reduce_ex__ for skip_data ctx) +class _SerializationLocal(threading.local): + def __init__(self): + super().__init__() + self.map_location: Optional[MAP_LOCATION] = None + self.skip_data: bool = False + self.materialize_fake_tensors: bool = False + + +_serialization_tls = _SerializationLocal() + + +class SourceChangeWarning(Warning): + pass + + +@contextmanager +def mkdtemp(): + path = tempfile.mkdtemp() + try: + yield path + finally: + shutil.rmtree(path) + + +_package_registry: List[ + Tuple[ + int, + Callable[[STORAGE], Optional[str]], + Callable[[STORAGE, str], Optional[STORAGE]], + ] +] = [] + + +class LoadEndianness(Enum): + NATIVE = 1 + LITTLE = 2 + BIG = 3 + + +_default_load_endian: Optional[LoadEndianness] = None + + +def get_default_load_endianness() -> Optional[LoadEndianness]: + """ + Get fallback byte order for loading files + + If byteorder mark is not present 
in saved checkpoint, + this byte order is used as fallback. + By default, it's "native" byte order. + + Returns: + default_load_endian: Optional[LoadEndianness] + """ + return _default_load_endian + + +def set_default_load_endianness(endianness): + """ + Set fallback byte order for loading files + + If byteorder mark is not present in saved checkpoint, + this byte order is used as fallback. + By default, it's "native" byte order. + + Args: + endianness: the new fallback byte order + """ + global _default_load_endian + if not isinstance(endianness, LoadEndianness) and endianness is not None: + raise TypeError("Invalid argument type in function set_default_load_endianness") + _default_load_endian = endianness + + +_compute_crc32: bool = True + + +def get_crc32_options() -> bool: + """ + Get whether :func:`torch.save` computes and writes crc32 for each record. + + Defaults to ``True``. + """ + return _compute_crc32 + + +def set_crc32_options(compute_crc32: bool): + """ + Set whether :func:`torch.save` computes and writes crc32 for each record. + + .. note:: + Setting this to ``False`` may make unzipping of the ``torch.save`` output + fail or warn due to corrupted CRC32. However ``torch.load`` will be + able to load the file. + + Args: + compute_crc32 (bool): set crc32 compuation flag + """ + global _compute_crc32 + _compute_crc32 = compute_crc32 + + +_default_mmap_options: int = MAP_PRIVATE + + +def get_default_mmap_options() -> int: + """ + Get default mmap options for :func:`torch.load` with ``mmap=True``. + + Defaults to ``mmap.MAP_PRIVATE``. + + + Returns: + default_mmap_options: int + """ + return _default_mmap_options + + +class set_default_mmap_options: + """ + Context manager or function to set default mmap options for :func:`torch.load` with ``mmap=True`` to flags. + + For now, only either ``mmap.MAP_PRIVATE`` or ``mmap.MAP_SHARED`` are supported. + Please open an issue if you need any other option to be added here. + + .. 
note:: + This feature is currently not supported for Windows. + + Args: + flags: ``mmap.MAP_PRIVATE`` or ``mmap.MAP_SHARED`` + """ + + def __init__(self, flags: int) -> None: + if IS_WINDOWS: + raise RuntimeError( + "Changing the default mmap options is currently not supported for Windows" + ) + if flags != MAP_PRIVATE and flags != MAP_SHARED: + raise ValueError( + "Invalid argument in function set_default_mmap_options, " + f"expected mmap.MAP_PRIVATE or mmap.MAP_SHARED, but got {flags}" + ) + global _default_mmap_options + self.prev = _default_mmap_options + _default_mmap_options = flags + + def __enter__(self) -> None: + pass + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + global _default_mmap_options + _default_mmap_options = self.prev + + +def clear_safe_globals() -> None: + """ + Clears the list of globals that are safe for ``weights_only`` load. + """ + _weights_only_unpickler._clear_safe_globals() + + +def get_safe_globals() -> List[Union[Callable, Tuple[Callable, str]]]: + """ + Returns the list of user-added globals that are safe for ``weights_only`` load. + """ + return _weights_only_unpickler._get_safe_globals() + + +def add_safe_globals(safe_globals: List[Union[Callable, Tuple[Callable, str]]]) -> None: + """ + Marks the given globals as safe for ``weights_only`` load. For example, functions + added to this list can be called during unpickling, classes could be instantiated + and have state set. + + Each item in the list can either be a function/class or a tuple of the form + (function/class, string) where string is the full path of the function/class. + + Within the serialized format, each function is identified with its full + path as ``{__module__}.{__name__}``. When calling this API, you can provide this + full path that should match the one in the checkpoint otherwise the default + ``{fn.__module__}.{fn.__name__}`` will be used. 
+ + Args: + safe_globals (List[Union[Callable, Tuple[Callable, str]]]): list of globals to mark as safe + + Example: + >>> # xdoctest: +SKIP("Can't torch.save(t, ...) as doctest thinks MyTensor is defined on torch.serialization") + >>> import tempfile + >>> class MyTensor(torch.Tensor): + ... pass + >>> t = MyTensor(torch.randn(2, 3)) + >>> with tempfile.NamedTemporaryFile() as f: + ... torch.save(t, f.name) + # Running `torch.load(f.name, weights_only=True)` will fail with + # Unsupported global: GLOBAL __main__.MyTensor was not an allowed global by default. + # Check the code and make sure MyTensor is safe to be used when loaded from an arbitrary checkpoint. + ... torch.serialization.add_safe_globals([MyTensor]) + ... torch.load(f.name, weights_only=True) + # MyTensor([[-0.5024, -1.8152, -0.5455], + # [-0.8234, 2.0500, -0.3657]]) + """ + _weights_only_unpickler._add_safe_globals(safe_globals) + + +class safe_globals(_weights_only_unpickler._safe_globals): + r"""Context-manager that adds certain globals as safe for ``weights_only`` load. + + Args: + safe_globals: List of globals for weights_only load. + + Example: + >>> # xdoctest: +SKIP("Can't torch.save(t, ...) as doctest thinks MyTensor is defined on torch.serialization") + >>> import tempfile + >>> class MyTensor(torch.Tensor): + ... pass + >>> t = MyTensor(torch.randn(2, 3)) + >>> with tempfile.NamedTemporaryFile() as f: + ... torch.save(t, f.name) + # Running `torch.load(f.name, weights_only=True)` will fail with + # Unsupported global: GLOBAL __main__.MyTensor was not an allowed global by default. + # Check the code and make sure MyTensor is safe to be used when loaded from an arbitrary checkpoint. + ... with torch.serialization.safe_globals([MyTensor]): + ... 
torch.load(f.name, weights_only=True) + # MyTensor([[-0.5024, -1.8152, -0.5455], + # [-0.8234, 2.0500, -0.3657]]) + >>> assert torch.serialization.get_safe_globals() == [] + """ + + +def get_unsafe_globals_in_checkpoint(f: FILE_LIKE) -> List[str]: + """Returns a list of strings of functions/classes in a ``torch.save`` object that are not safe for ``weights_only``. + + For a given function or class ``f``, the corresponding string will be of the form + ``{f.__module__}.{f.__name__}``. + + This function will return any GLOBALs in the checkpoint that are not in the set marked safe + for ``weights_only`` (either via :func:`add_safe_globals` or :class:`safe_globals` context or + allowlisted by ``torch`` by default). + + .. note:: + This function will statically disassemble the pickle file in the checkpoint. + The implication is any classes dynamically pushed onto the stack during unpickling + will not be included in the output. + + Args: + f: File-like object or string containing the checkpoint object saved via ``torch.save`` + + Returns: + A list of strings of pickle GLOBALs in the checkpoint that are not allowlisted for ``weights_only``. 
+ """ + default_safe_globals_strings = set( + _weights_only_unpickler._get_allowed_globals().keys() + ) + user_safe_global_strings = set( + _weights_only_unpickler._get_user_allowed_globals().keys() + ) + safe_global_strings = default_safe_globals_strings.union(user_safe_global_strings) + + with _open_file_like(f, "rb") as opened_file: + if not _is_zipfile(opened_file): + raise ValueError("Expected input to be a checkpoint returned by torch.save") + with _open_zipfile_reader(opened_file) as zip_file: + if _is_torchscript_zip(zip_file): + raise ValueError( + "Expected input to be a checkpoint returned by torch.save but got a torchscript checkpoint" + ) + data_file = io.BytesIO(zip_file.get_record("data.pkl")) + all_globals = _weights_only_unpickler.get_globals_in_pkl(data_file) + return list(all_globals.difference(safe_global_strings)) + + +class skip_data: + """ + Context-manager that skips writing storage bytes for ``torch.save`` calls. + + Storages will still be saved, but the space that their bytes would usually be written to + will be empty space. The storage bytes can then be populated in a separate pass. + + .. warning:: + The ``skip_data`` context manager is an early prototype and is subject to change. + + Args: + materialize_fake_tensors: Whether to materialize FakeTensors. + + Example: + >>> # xdoctest: +SKIP("NamedTemporaryFile on Windows") + >>> import tempfile + >>> t = torch.randn(2, 3) + >>> with tempfile.NamedTemporaryFile() as f: + ... with torch.serialization.skip_data(): + ... torch.save(t, f.name) + ... 
torch.load(f.name, weights_only=True) + tensor([[0., 0., 0.], + [0., 0., 0.]]) + """ + + def __init__(self, materialize_fake_tensors: bool = False): + self.materialize_fake_tensors = materialize_fake_tensors + + def __enter__(self): + global _serialization_tls + self._old_skip_data = _serialization_tls.skip_data + self._old_materialize_fake_tensors = _serialization_tls.materialize_fake_tensors + _serialization_tls.skip_data = True + _serialization_tls.materialize_fake_tensors = self.materialize_fake_tensors + + def __exit__(self, type, value, tb): + global _serialization_tls + _serialization_tls.skip_data = self._old_skip_data + _serialization_tls.materialize_fake_tensors = self._old_materialize_fake_tensors + + +def _is_zipfile(f) -> bool: + # This is a stricter implementation than zipfile.is_zipfile(). + # zipfile.is_zipfile() is True if the magic number appears anywhere in the + # binary. Since we expect the files here to be generated by torch.save or + # torch.jit.save, it's safe to only check the start bytes and avoid + # collisions and assume the zip has only 1 file. + # See bugs.python.org/issue28494. + + start = f.tell() + # Read the first few bytes and match against the ZIP file signature + local_header_magic_number = b"PK\x03\x04" + read_bytes = f.read(len(local_header_magic_number)) + f.seek(start) + return read_bytes == local_header_magic_number + + +def register_package( + priority: int, + tagger: Callable[[STORAGE], Optional[str]], + deserializer: Callable[[STORAGE, str], Optional[STORAGE]], +): + """ + Registers callables for tagging and deserializing storage objects with an associated priority. + Tagging associates a device with a storage object at save time while deserializing moves a + storage object to an appropriate device at load time. :attr:`tagger` and :attr:`deserializer` + are run in the order given by their :attr:`priority` until a tagger/deserializer returns a + value that is not `None`. 

    To override the deserialization behavior for a device in the global registry, one can register a
    tagger with a higher priority than the existing tagger.

    This function can also be used to register a tagger and deserializer for new devices.

    Args:
        priority: Indicates the priority associated with the tagger and deserializer, where a lower
            value indicates higher priority.
        tagger: Callable that takes in a storage object and returns its tagged device as a string
            or None.
        deserializer: Callable that takes in storage object and a device string and returns a storage
            object on the appropriate device or None.

    Returns:
        `None`

    Example:
        >>> def ipu_tag(obj):
        >>>     if obj.device.type == 'ipu':
        >>>         return 'ipu'
        >>> def ipu_deserialize(obj, location):
        >>>     if location.startswith('ipu'):
        >>>         ipu = getattr(torch, "ipu", None)
        >>>         assert ipu is not None, "IPU device module is not loaded"
        >>>         assert torch.ipu.is_available(), "ipu is not available"
        >>>         return obj.ipu(location)
        >>> torch.serialization.register_package(11, ipu_tag, ipu_deserialize)
    """
    # Registry entries are (priority, tagger, deserializer) tuples. Keeping the
    # list sorted means location_tag() / default_restore_location() probe the
    # lowest priority value (highest precedence) first.
    queue_elem = (priority, tagger, deserializer)
    _package_registry.append(queue_elem)
    _package_registry.sort()


def check_module_version_greater_or_equal(
    module,
    req_version_tuple,
    error_if_malformed=True,
):
    """
    Check if a module's version satisfies requirements

    Usually, a module's version string will be like 'x.y.z', which would be represented
    as a tuple (x, y, z), but sometimes it could be an unexpected format. If the version
    string does not match the given tuple's format up to the length of the tuple, then
    error and exit or emit a warning.

    Args:
        module: the module to check the version of
        req_version_tuple: tuple (usually of ints) representing the required version
        error_if_malformed: whether we should exit if module version string is malformed

    Returns:
        requirement_is_met: bool
    """
    try:
        version_strs = module.__version__.split(".")
        # Cast module version fields to match the types of the required version
        module_version = tuple(
            type(req_field)(version_strs[idx])
            for idx, req_field in enumerate(req_version_tuple)
        )
        requirement_is_met = module_version >= req_version_tuple

    except Exception as e:
        # Any failure above (missing __version__, non-numeric field, fewer
        # fields than required) is treated as a malformed version string.
        message = (
            f"'{module.__name__}' module version string is malformed '{module.__version__}' and cannot be compared"
            f" with tuple {str(req_version_tuple)}"
        )
        if error_if_malformed:
            raise RuntimeError(message) from e
        else:
            # Optimistic fallback: assume the requirement is met when we
            # cannot parse the version.
            warnings.warn(message + ", but continuing assuming that requirement is met")
            requirement_is_met = True

    return requirement_is_met


# --- Device taggers --------------------------------------------------------
# Each tagger maps a storage's device to the location string recorded in a
# checkpoint, returning None when it does not apply (so the registry falls
# through to the next entry).
def _cpu_tag(obj):
    if obj.device.type == "cpu":
        return "cpu"


def _mps_tag(obj):
    if obj.device.type == "mps":
        return "mps"


def _meta_tag(obj):
    if obj.device.type == "meta":
        return "meta"


def _backend_tag(backend_name, obj):
    # Generic tagger for index-addressable backends (cuda/hpu/xpu/privateuse1).
    # 'privateuse1' is resolved to whatever backend name was registered for it.
    if backend_name == "privateuse1":
        backend_name = torch._C._get_privateuse1_backend_name()
    if obj.device.type == backend_name:
        if obj.device.index is None:
            return backend_name
        else:
            return backend_name + ":" + str(obj.device.index)


# --- Deserializers ---------------------------------------------------------
# Each deserializer restores a storage for its location tag, returning None
# when the tag is not for it.
def _cpu_deserialize(obj, location):
    if location == "cpu":
        return obj


def _mps_deserialize(obj, location):
    if location.startswith("mps"):
        return obj.mps()


def _meta_deserialize(obj, location):
    if location == "meta":
        # Meta storages carry no data; recreate an empty storage of equal size.
        return torch.UntypedStorage(obj.nbytes(), device="meta")


def _validate_device(location, backend_name):
    """
    Check whether the device index of specified backend is valid

    In case of privateuse1 backend, your must first register a device_module for
    privateuse1 using torch._register_device_module. Implement the following
    methods in device_module like cuda: device_module._utils._get_device_index(location, True),
    device_module.device_count().

    Args:
        location: string of device
        backend_name: the backend name or the name of privateuse1, which can be renamed

    Returns:
        device_index: int
    """
    if not hasattr(torch, backend_name):
        raise RuntimeError(
            f"The {backend_name.upper()} device module is not registered. "
            "If you are running on a CPU-only machine, "
            "please use torch.load with map_location=torch.device('cpu') "
            "to map your storages to the CPU."
        )
    device_module = getattr(torch, backend_name)
    if hasattr(device_module, "_utils") and hasattr(
        device_module._utils, "_get_device_index"
    ):
        device_index = device_module._utils._get_device_index(location, True)
        device = torch.device(backend_name, device_index)
    else:
        device = torch.device(location)
        # Note: an explicit index of 0 and a missing index both map to 0 here.
        device_index = device.index if device.index else 0
    if hasattr(device_module, "is_available") and not device_module.is_available():
        raise RuntimeError(
            f"Attempting to deserialize object on a {backend_name.upper()} "
            f"device but torch.{backend_name}.is_available() is False. "
            "If you are running on a CPU-only machine, "
            "please use torch.load with map_location=torch.device('cpu') "
            "to map your storages to the CPU."
        )
    if hasattr(device_module, "device_count"):
        device_count = device_module.device_count()
        if device_index >= device_count:
            raise RuntimeError(
                f"Attempting to deserialize object on {backend_name.upper()} device "
                f"{device_index} but torch.{backend_name}.device_count() is {device_count}. "
                "Please use torch.load with map_location to map your storages "
                "to an existing device."
            )
    return device


def validate_cuda_device(location):
    # Backward-compatible wrapper returning just the validated CUDA index.
    return _validate_device(location, "cuda").index


def validate_hpu_device(location):
    # Backward-compatible wrapper returning just the validated HPU index.
    return _validate_device(location, "hpu").index


def _deserialize(backend_name, obj, location):
    # Generic deserializer: validate the target device, then move the storage.
    if backend_name == "privateuse1":
        backend_name = torch._C._get_privateuse1_backend_name()
    if location.startswith(backend_name):
        device = _validate_device(location, backend_name)
        return obj.to(device=device)


# Default registrations. The priority values fix the probe order used by
# location_tag() and default_restore_location(): cpu first, then cuda, mps,
# meta, privateuse1, hpu, xpu.
register_package(10, _cpu_tag, _cpu_deserialize)
register_package(
    20,
    functools.partial(_backend_tag, "cuda"),
    functools.partial(_deserialize, "cuda"),
)
register_package(21, _mps_tag, _mps_deserialize)
register_package(22, _meta_tag, _meta_deserialize)
register_package(
    23,
    functools.partial(_backend_tag, "privateuse1"),
    functools.partial(_deserialize, "privateuse1"),
)
register_package(
    24,
    functools.partial(_backend_tag, "hpu"),
    functools.partial(_deserialize, "hpu"),
)
register_package(
    25,
    functools.partial(_backend_tag, "xpu"),
    functools.partial(_deserialize, "xpu"),
)


def location_tag(
    storage: Union[Storage, torch.storage.TypedStorage, torch.UntypedStorage],
):
    # Ask each registered tagger in priority order; the first non-falsy tag wins.
    for _, tagger, _ in _package_registry:
        location = tagger(storage)
        if location:
            return location
    raise RuntimeError(
        "don't know how to determine data location of " + torch.typename(storage)
    )


def default_restore_location(storage, location):
    """
    Restores `storage` using a deserializer function registered for the `location`.

    This function looks in the registry for deserializer functions that match the `location`.
    If found, it attempts to use them, in priority order, to restore `storage` until one
    returns a not `None` result. If no deserializer can be found in the registry, or all found fail
    to bear a result, it raises a `RuntimeError`.

    Args:
        storage (STORAGE): the storage object to restore
        location (str): the location tag associated with the storage object

    Returns:
        storage: Optional[STORAGE]

    Raises:
        RuntimeError: If no deserializer matching `location` is found in the registry or if
            all matching ones return `None`.
    """
    for _, _, fn in _package_registry:
        result = fn(storage, location)
        if result is not None:
            return result
    raise RuntimeError(
        "don't know how to restore data location of "
        + torch.typename(storage)
        + " (tagged with "
        + location
        + ")"
    )


def normalize_storage_type(storage_type):
    # Map a storage class to its canonical torch.* counterpart by name.
    return getattr(torch, storage_type.__name__)


def storage_to_tensor_type(storage):
    # E.g. xxx.FloatStorage -> xxx.FloatTensor, resolved in the storage's module.
    storage_type = type(storage)
    module = _import_dotted_name(storage_type.__module__)
    return getattr(module, storage_type.__name__.replace("Storage", "Tensor"))


def _is_path(name_or_buffer) -> TypeIs[Union[str, os.PathLike]]:
    return isinstance(name_or_buffer, (str, os.PathLike))


class _opener:
    # Minimal context-manager wrapper that yields the underlying file-like
    # object; subclasses override __exit__ to close/flush as appropriate.
    def __init__(self, file_like):
        self.file_like = file_like

    def __enter__(self):
        return self.file_like

    def __exit__(self, *args):
        pass


class _open_file(_opener):
    # Opens a real file by path and closes it on exit.
    def __init__(self, name, mode):
        super().__init__(open(name, mode))

    def __exit__(self, *args):
        self.file_like.close()


class _open_buffer_reader(_opener):
    # Wraps a caller-provided readable buffer; verifies it is seekable up front.
    def __init__(self, buffer):
        super().__init__(buffer)
        _check_seekable(buffer)


class _open_buffer_writer(_opener):
    # Wraps a caller-provided writable buffer; flushes (but does not close) it.
    def __exit__(self, *args):
        self.file_like.flush()


def _open_file_like(name_or_buffer, mode):
    # Dispatch: paths get a real file; buffers get a thin reader/writer wrapper.
    if _is_path(name_or_buffer):
        return _open_file(name_or_buffer, mode)
    else:
        if "w" in mode:
            return _open_buffer_writer(name_or_buffer)
        elif "r" in mode:
            return _open_buffer_reader(name_or_buffer)
        else:
            raise RuntimeError(f"Expected 'r' or 'w' in mode but got {mode}")


class _open_zipfile_reader(_opener):
    def __init__(self, name_or_buffer) -> None:
    # We need it so that Sphinx doesn't leak `pickle`s path from
    # NOTE(review): the docstring opening of save() appears truncated by patch
    # extraction at this point — the Args/description section is missing from
    # this hunk view.
    # the build environment (e.g. `>> # xdoctest: +SKIP("makes cwd dirty")
        >>> # Save to file
        >>> x = torch.tensor([0, 1, 2, 3, 4])
        >>> torch.save(x, "tensor.pt")
        >>> # Save to io.BytesIO buffer
        >>> buffer = io.BytesIO()
        >>> torch.save(x, buffer)
    """
    torch._C._log_api_usage_once("torch.save")
    _check_dill_version(pickle_module)
    _check_save_filelike(f)

    if _use_new_zipfile_serialization:
        with _open_zipfile_writer(f) as opened_zipfile:
            _save(
                obj,
                opened_zipfile,
                pickle_module,
                pickle_protocol,
                _disable_byteorder_record,
            )
            return
    else:
        global _serialization_tls
        if _serialization_tls.skip_data:
            raise RuntimeError(
                "Cannot use skip_data=True with _use_new_zipfile_serialization=False"
            )
        with _open_file_like(f, "wb") as opened_file:
            _legacy_save(obj, opened_file, pickle_module, pickle_protocol)


def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None:
    # Writes the pre-zipfile (sequential pickle) checkpoint format:
    # magic number, protocol, sys_info, pickled object graph, sorted storage
    # keys, then the raw storage bytes.
    import torch.nn as nn

    serialized_container_types = {}
    serialized_storages: Dict[str, Tuple[torch.UntypedStorage, torch.dtype]] = {}

    # Since loading storages that view the same data with different dtypes is
    # not supported, we need to keep track of the dtype associated with each
    # storage data_ptr and throw an error if the dtype is ever different.
    # TODO: This feature could be added in the future
    storage_dtypes: Dict[int, torch.dtype] = {}

    def persistent_id(obj: Any) -> Optional[Tuple]:
        # FIXME: the docs say that persistent_id should only return a string
        # but torch store returns tuples. This works only in the binary protocol
        # see
        # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
        # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            # nn.Module *classes* get their source code recorded (once per
            # class) so it can be diffed against the current source on load.
            if obj in serialized_container_types:
                return None
            serialized_container_types[obj] = True
            source_file = source = None
            try:
                source_lines, _, source_file = get_source_lines_and_file(obj)
                source = "".join(source_lines)
            except (
                Exception
            ):  # saving the source is optional, so we can ignore any errors
                warnings.warn(
                    "Couldn't retrieve source code for container of "
                    "type " + obj.__name__ + ". It won't be checked "
                    "for correctness upon loading."
                )
            return ("module", obj, source_file, source)

        if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj):
            storage: torch.UntypedStorage

            if isinstance(obj, torch.storage.TypedStorage):
                # TODO: Once we decide to break serialization FC, this case
                # can be deleted
                storage = obj._untyped_storage
                storage_dtype = obj.dtype
                storage_type_str = obj._pickle_storage_type()
                storage_type = getattr(torch, storage_type_str)
                dtype = obj.dtype
                storage_numel = obj._size()

            elif isinstance(obj, torch.UntypedStorage):
                storage = obj
                storage_dtype = torch.uint8
                storage_type = normalize_storage_type(type(obj))
                dtype = torch.uint8
                storage_numel = storage.nbytes()
            else:
                raise TypeError(f"type not recognized: {type(obj)}")

            # If storage is allocated, ensure that any other saved storages
            # pointing to the same data all have the same dtype. If storage is
            # not allocated, don't perform this check
            if storage.data_ptr() != 0:
                if storage.data_ptr() in storage_dtypes:
                    if storage_dtype != storage_dtypes[storage.data_ptr()]:
                        raise RuntimeError(
                            "Cannot save multiple tensors or storages that "
                            "view the same data as different types"
                        )
                else:
                    storage_dtypes[storage.data_ptr()] = storage_dtype

            view_metadata: Optional[Tuple[str, int, int]]

            # Offset is always 0, but we keep it for backwards compatibility
            # with the old serialization format (which supported storage views)
            offset = 0
            storage_key = str(storage._cdata)
            location = location_tag(storage)

            # TODO: There's an issue here with FC. It might be impossible to
            # solve, but it's worth noting. Imagine we save a list `[storage,
            # tensor]`, where `tensor.storage()` is the same as `storage`, and
            # `tensor.element_size() > 1`. Let's say that `tensor.dtype ==
            # torch.float`. The storage will be serialized with element size
            # of 1, since we're choosing to serialize the first occurance of
            # a duplicate storage. Since this legacy serialization format saves
            # the numel of the storage, rather than nbytes directly, we'll be
            # effectively saving nbytes in this case. We'll be able to load it
            # and the tensor back up with no problems in _this_ and future
            # versions of pytorch, but in older versions, here's the problem:
            # the storage will be loaded up as a UntypedStorage, and then the
            # FloatTensor will loaded and the UntypedStorage will be assigned to
            # it. Since the storage dtype does not match the tensor dtype, this
            # will cause an error. If we reverse the list, like `[tensor,
            # storage]`, then we will save the `tensor.storage()` as a faked
            # `FloatStorage`, and the saved size will be the correct
            # dtype-specific numel count that old versions expect. `tensor`
            # will be able to load up properly in old versions, pointing to
            # a FloatStorage. However, `storage` is still being translated to
            # a UntypedStorage, and it will try to resolve to the same
            # FloatStorage that `tensor` contains. This will also cause an
            # error. It doesn't seem like there's any way around this.
            # Probably, we just cannot maintain FC for the legacy format if the
            # saved list contains both a tensor and a storage that point to the
            # same data. We should still be able to maintain FC for lists of
            # just tensors, as long as all views share the same dtype as the
            # tensor they are viewing.

            if storage_key not in serialized_storages:
                serialized_storages[storage_key] = (storage, dtype)
            # NOTE(review): `storage._cdata != storage._cdata` is always False,
            # so `view_metadata` is always None — this looks like a vestige of
            # the removed storage-view support, kept so the serialized record
            # shape stays backward compatible. Confirm before simplifying.
            is_view = storage._cdata != storage._cdata
            if is_view:
                view_metadata = (str(storage._cdata), offset, storage.nbytes())
            else:
                view_metadata = None

            res = (
                "storage",
                storage_type,
                storage_key,
                location,
                storage_numel,
                view_metadata,
            )
            return res
        return None

    sys_info = dict(
        protocol_version=PROTOCOL_VERSION,
        little_endian=sys.byteorder == "little",
        type_sizes=dict(
            short=SHORT_SIZE,
            int=INT_SIZE,
            long=LONG_SIZE,
        ),
    )

    pickle_module.dump(MAGIC_NUMBER, f, protocol=pickle_protocol)
    pickle_module.dump(PROTOCOL_VERSION, f, protocol=pickle_protocol)
    pickle_module.dump(sys_info, f, protocol=pickle_protocol)

    class PyTorchLegacyPickler(pickle_module.Pickler):
        def persistent_id(self, obj):
            return persistent_id(obj)

    pickler = PyTorchLegacyPickler(f, protocol=pickle_protocol)
    pickler.dump(obj)

    serialized_storage_keys = sorted(serialized_storages.keys())
    pickle_module.dump(serialized_storage_keys, f, protocol=pickle_protocol)
    f.flush()
    # DeepSpeed fast-checkpoint hook: file writers that expose
    # save_torch_storage_object_list (e.g. the deepspeed.io writers from this
    # patch) receive the whole sorted (storage, dtype) list in one call,
    # instead of a per-storage _write_file round trip.
    if hasattr(f, 'save_torch_storage_object_list'):
        sorted_storage_objects = [serialized_storages[key] for key in serialized_storage_keys]
        f.save_torch_storage_object_list(sorted_storage_objects, True)
    else:
        for key in serialized_storage_keys:
            storage, dtype = serialized_storages[key]
            storage._write_file(
                f,
                _should_read_directly(f), True, torch._utils._element_size(dtype)
            )


def _save(
    obj,
    zip_file,
    pickle_module,
    pickle_protocol,
    _disable_byteorder_record,
):
    # Writes the zipfile checkpoint format: "data.pkl" (pickled object graph),
    # an optional "byteorder" record, then one "data/<key>" record per
    # deduplicated storage.
    serialized_storages = {}
    id_map: Dict[int, str] = {}

    # Since loading storages that view the same data with different dtypes is
    # not supported, we need to keep track of the dtype associated with each
    # storage data_ptr and throw an error if the dtype is ever different.
    # TODO: This feature could be added in the future
    storage_dtypes: Dict[int, torch.dtype] = {}

    def persistent_id(obj):
        # FIXME: the docs say that persistent_id should only return a string
        # but torch store returns tuples. This works only in the binary protocol
        # see
        # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
        # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
        if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj):
            if isinstance(obj, torch.storage.TypedStorage):
                # TODO: Once we decide to break serialization FC, this case
                # can be deleted
                storage = obj._untyped_storage
                storage_dtype = obj.dtype
                storage_type_str = obj._pickle_storage_type()
                storage_type = getattr(torch, storage_type_str)
                storage_numel = obj._size()

            else:
                storage = obj
                storage_dtype = torch.uint8
                storage_type = normalize_storage_type(type(obj))
                storage_numel = storage.nbytes()

            # If storage is allocated, ensure that any other saved storages
            # pointing to the same data all have the same dtype. If storage is
            # not allocated, don't perform this check
            if str(storage.device) != "meta" and storage.data_ptr() != 0:
                if storage.data_ptr() in storage_dtypes:
                    if storage_dtype != storage_dtypes[storage.data_ptr()]:
                        raise RuntimeError(
                            "Cannot save multiple tensors or storages that "
                            "view the same data as different types"
                        )
                else:
                    storage_dtypes[storage.data_ptr()] = storage_dtype

            # Deduplicate storages by C-level identity; keys are dense string
            # indices assigned in first-seen order.
            storage_key = id_map.setdefault(storage._cdata, str(len(id_map)))
            # _fake_device, when present, overrides the real location tag
            # (used by fake-tensor serialization paths).
            if hasattr(obj, "_fake_device") and obj._fake_device is not None:
                location = str(obj._fake_device)
            else:
                location = location_tag(storage)
            serialized_storages[storage_key] = storage

            return ("storage", storage_type, storage_key, location, storage_numel)

        return None

    # Write the pickle data for `obj`
    data_buf = io.BytesIO()

    class PyTorchPickler(pickle_module.Pickler):  # type: ignore[name-defined]
        def persistent_id(self, obj):
            return persistent_id(obj)

    pickler = PyTorchPickler(data_buf, protocol=pickle_protocol)
    pickler.dump(obj)
    data_value = data_buf.getvalue()
    zip_file.write_record("data.pkl", data_value, len(data_value))

    # Write byte order marker
    if not _disable_byteorder_record:
        if sys.byteorder not in ["little", "big"]:
            raise ValueError("Unknown endianness type: " + sys.byteorder)

        zip_file.write_record("byteorder", sys.byteorder, len(sys.byteorder))

    # Write each tensor to a file named tensor/the_tensor_key in the zip archive
    for key in sorted(serialized_storages.keys()):
        name = f"data/{key}"
        storage = serialized_storages[key]
        num_bytes = storage.nbytes()
        global _serialization_tls
        if _serialization_tls.skip_data:
            # skip_data mode reserves space for the record without copying bytes.
            zip_file.write_record_metadata(name, num_bytes)
        else:
            # given that we copy things around anyway, we might use storage.cpu()
            # this means to that to get tensors serialized, you need to implement
            # .cpu() on the underlying Storage
            if storage.device.type != "cpu":
                storage = storage.cpu()
            # Now that it is on
            # the CPU we can directly copy it into the zip file
            zip_file.write_record(name, storage, num_bytes)


def load(
    f: FILE_LIKE,
    map_location: MAP_LOCATION = None,
    pickle_module: Any = None,
    *,
    weights_only: Optional[bool] = None,
    mmap: Optional[bool] = None,
    **pickle_load_args: Any,
) -> Any:
    # Reference: https://github.com/pytorch/pytorch/issues/54354
    # The first line of this docstring overrides the one Sphinx generates for the
    # documentation. We need it so that Sphinx doesn't leak `pickle`s path from
    # NOTE(review): the docstring opening of load() appears truncated by patch
    # extraction here — only the Example section survives in this hunk view.
    # the build environment (e.g. `>> # xdoctest: +SKIP("undefined filepaths")
        >>> torch.load("tensors.pt", weights_only=True)
        # Load all tensors onto the CPU
        >>> torch.load("tensors.pt", map_location=torch.device("cpu"), weights_only=True)
        # Load all tensors onto the CPU, using a function
        >>> torch.load(
        ...     "tensors.pt", map_location=lambda storage, loc: storage, weights_only=True
        ... )
        # Load all tensors onto GPU 1
        >>> torch.load(
        ...     "tensors.pt",
        ...     map_location=lambda storage, loc: storage.cuda(1),
        ...     weights_only=True,
        ... )  # type: ignore[attr-defined]
        # Map tensors from GPU 1 to GPU 0
        >>> torch.load("tensors.pt", map_location={"cuda:1": "cuda:0"}, weights_only=True)
        # Load tensor from io.BytesIO object
        # Loading from a buffer setting weights_only=False, warning this can be unsafe
        >>> with open("tensor.pt", "rb") as f:
        ...     buffer = io.BytesIO(f.read())
        >>> torch.load(buffer, weights_only=False)
        # Load a module with 'ascii' encoding for unpickling
        # Loading from a module setting weights_only=False, warning this can be unsafe
        >>> torch.load("module.pt", encoding="ascii", weights_only=False)
    """
    torch._C._log_api_usage_once("torch.load")
    DOCS_MESSAGE = (
        "\n\nCheck the documentation of torch.load to learn more about types accepted by default with "
        "weights_only https://pytorch.org/docs/stable/generated/torch.load.html."
    )

    def _get_wo_message(message: str) -> str:
        # Rewrites a WeightsUnpickler error into actionable guidance, depending
        # on which failure pattern the message matches.
        unsafe_global_pattern = r"GLOBAL (\S+) was not an allowed global by default."
        has_unsafe_global = re.search(unsafe_global_pattern, message) is not None
        blocklist_pattern = r"whose module (\S+) is blocked"
        has_blocklist = re.search(blocklist_pattern, message) is not None
        import_pattern = r"(\S+) must be (\S+) to load"
        has_import = re.search(import_pattern, message) is not None
        if has_unsafe_global:
            updated_message = (
                "Weights only load failed. This file can still be loaded, to do so you have two options, "
                "\033[1mdo those steps only if you trust the source of the checkpoint\033[0m. "
                f"\n\t(1) {UNSAFE_MESSAGE}\n\t(2) Alternatively, to load with `weights_only=True` please check "
                "the recommended steps in the following error message.\n\tWeightsUnpickler error: "
                + message
            )
        else:
            if has_import:
                return f"Weights only load failed. {message}\n {UNSAFE_MESSAGE}\n"
            else:
                updated_message = f"Weights only load failed. {UNSAFE_MESSAGE}\n"
                if not has_blocklist:
                    updated_message += (
                        "Please file an issue with the following so that we can make "
                        "`weights_only=True` compatible with your use case: WeightsUnpickler error: "
                    )
            updated_message += message
        return updated_message + DOCS_MESSAGE

    global _serialization_tls
    skip_data = _serialization_tls.skip_data
    if skip_data:
        raise RuntimeError(
            "`torch.load` called within a torch.serialization.skip_data context manager "
            "is not supported yet. Please call torch.load outside the skip_data context manager."
        )

    weights_only_not_set = weights_only is None

    if weights_only_not_set:
        weights_only = _default_to_weights_only(pickle_module)

    true_values = ["1", "y", "yes", "true"]
    # Add ability to force safe only or non-safe weight loads via environment variables
    force_weights_only_load = (
        os.getenv("TORCH_FORCE_WEIGHTS_ONLY_LOAD", "0") in true_values
    )
    force_no_weights_only_load = (
        os.getenv("TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD", "0") in true_values
    )

    if force_weights_only_load and force_no_weights_only_load:
        raise RuntimeError(
            "Only one of `TORCH_FORCE_WEIGHTS_ONLY_LOAD` or `TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD` "
            "should be set, but both were set."
        )
    elif force_weights_only_load:
        weights_only = True
    elif force_no_weights_only_load:
        # TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD can only override if callsite did not explicitly set weights_only
        if weights_only_not_set:
            warnings.warn(
                "Environment variable TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD detected, since the"
                "`weights_only` argument was not explicitly passed to `torch.load`, forcing weights_only=False.",
                UserWarning,
                stacklevel=2,
            )
            weights_only = False

    if weights_only:
        if pickle_module is not None:
            raise RuntimeError(
                "Can not safely load weights when explicit pickle_module is specified"
            )
    else:
        if pickle_module is None:
            pickle_module = pickle

    # make flipping default BC-compatible
    if mmap is None:
        mmap = False

    _check_dill_version(pickle_module)

    if "encoding" not in pickle_load_args.keys():
        pickle_load_args["encoding"] = "utf-8"

    with _open_file_like(f, "rb") as opened_file:
        if _is_zipfile(opened_file):
            # The zipfile reader is going to advance the current file position.
            # If we want to actually tail call to torch.jit.load, we need to
            # reset back to the original position.
            orig_position = opened_file.tell()
            overall_storage = None
            with _open_zipfile_reader(opened_file) as opened_zipfile:
                if _is_torchscript_zip(opened_zipfile):
                    warnings.warn(
                        "'torch.load' received a zip file that looks like a TorchScript archive"
                        " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to"
                        " silence this warning)",
                        UserWarning,
                    )
                    if weights_only:
                        raise RuntimeError(
                            "Cannot use ``weights_only=True`` with TorchScript archives passed to "
                            "``torch.load``. " + UNSAFE_MESSAGE
                        )
                    opened_file.seek(orig_position)
                    return torch.jit.load(opened_file, map_location=map_location)
                if mmap:
                    if not _is_path(f):
                        raise ValueError(
                            "f must be a file path in order to use the mmap argument"
                        )
                    size = os.path.getsize(f)
                    if not IS_WINDOWS:
                        shared = get_default_mmap_options() == MAP_SHARED
                    else:
                        shared = False
                    # The whole file is mapped once; _load slices storages out
                    # of this overall storage by record offset.
                    overall_storage = torch.UntypedStorage.from_file(
                        os.fspath(f), shared, size
                    )
                if weights_only:
                    try:
                        return _load(
                            opened_zipfile,
                            map_location,
                            _weights_only_unpickler,
                            overall_storage=overall_storage,
                            **pickle_load_args,
                        )
                    except pickle.UnpicklingError as e:
                        raise pickle.UnpicklingError(_get_wo_message(str(e))) from None
                return _load(
                    opened_zipfile,
                    map_location,
                    pickle_module,
                    overall_storage=overall_storage,
                    **pickle_load_args,
                )
        if mmap:
            # mmap requires the zipfile format; legacy files cannot be mapped.
            f_name = "" if not isinstance(f, str) else f"{f}, "
            raise RuntimeError(
                "mmap can only be used with files saved with "
                f"`torch.save({f_name}_use_new_zipfile_serialization=True), "
                "please torch.save your checkpoint with this option in order to use mmap."
            )
        if weights_only:
            try:
                return _legacy_load(
                    opened_file,
                    map_location,
                    _weights_only_unpickler,
                    **pickle_load_args,
                )
            except pickle.UnpicklingError as e:
                raise pickle.UnpicklingError(_get_wo_message(str(e))) from None
        return _legacy_load(
            opened_file, map_location, pickle_module, **pickle_load_args
        )


# Register pickling support for layout instances such as
# torch.sparse_coo, etc
def _get_layout(name):
    """Get layout extension object from its string representation."""
    cache = _get_layout.cache  # type: ignore[attr-defined]
    if not cache:
        # Lazily populate the cache from all torch.layout singletons.
        for v in torch.__dict__.values():
            if isinstance(v, torch.layout):
                cache[str(v)] = v
    return cache[name]


# There are yet not good way to type annotate function attributes https://github.com/python/mypy/issues/2087
_get_layout.cache = {}  # type: ignore[attr-defined]
copyreg.pickle(torch.layout, lambda obj: (_get_layout, (str(obj),)))


def _legacy_load(f, map_location, pickle_module, **pickle_load_args):
    # Loads checkpoints written by _legacy_save (sequential pickle format) or
    # the even older tar-based format (handled by the inner legacy_load).
    deserialized_objects: Dict[int, Any] = {}

    restore_location = _get_restore_location(map_location)

    class UnpicklerWrapper(pickle_module.Unpickler):  # type: ignore[name-defined]
        def find_class(self, mod_name, name):
            # Legacy checkpoints reference torch.XxxStorage classes; map them
            # to the lightweight StorageType stand-in that just carries dtype.
            if type(name) is str and "Storage" in name:
                try:
                    return StorageType(name)
                except KeyError:
                    pass
            return super().find_class(mod_name, name)

    def _check_container_source(container_type, source_file, original_source):
        # Warn (and optionally write a reverse patch) when an nn.Module class'
        # current source differs from the source recorded at save time.
        try:
            current_source = "".join(get_source_lines_and_file(container_type)[0])
        except Exception:  # saving the source is optional, so we can ignore any errors
            warnings.warn(
                "Couldn't retrieve source code for container of "
                "type " + container_type.__name__ + ". It won't be checked "
                "for correctness upon loading."
            )
            return
        if original_source != current_source:
            if container_type.dump_patches:
                file_name = container_type.__name__ + ".patch"
                diff = difflib.unified_diff(
                    current_source.split("\n"),
                    original_source.split("\n"),
                    source_file,
                    source_file,
                    lineterm="",
                )
                lines = "\n".join(diff)
                try:
                    with open(file_name, "a+") as f:
                        file_size = f.seek(0, 2)
                        f.seek(0)
                        if file_size == 0:
                            f.write(lines)
                        elif file_size != len(lines) or f.read() != lines:
                            raise OSError
                    msg = (
                        "Saved a reverse patch to " + file_name + ". "
                        "Run `patch -p0 < " + file_name + "` to revert your "
                        "changes."
                    )
                except OSError:
                    msg = (
                        "Tried to save a patch, but couldn't create a "
                        "writable file " + file_name + ". Make sure it "
                        "doesn't exist and your working directory is "
                        "writable."
                    )
            else:
                msg = (
                    "you can retrieve the original source code by "
                    "accessing the object's source attribute or set "
                    "`torch.nn.Module.dump_patches = True` and use the "
                    "patch tool to revert the changes."
                )
            msg = f"source code of class '{torch.typename(container_type)}' has changed. {msg}"
            warnings.warn(msg, SourceChangeWarning)

    def legacy_load(f):
        # Reader for the oldest (tar-based) checkpoint layout: members
        # "storages", "tensors" and a pickled object graph.
        deserialized_objects: Dict[int, Any] = {}

        def persistent_load(saved_id):
            if isinstance(saved_id, tuple):
                # Ignore containers that don't have any sources saved
                if all(saved_id[1:]):
                    _check_container_source(*saved_id)
                return saved_id[0]
            return deserialized_objects[int(saved_id)]

        with closing(
            tarfile.open(fileobj=f, mode="r:", format=tarfile.PAX_FORMAT)
        ) as tar, mkdtemp() as tmpdir:
            if pickle_module is _weights_only_unpickler:
                raise RuntimeError(
                    "Cannot use ``weights_only=True`` with files saved in the "
                    "legacy .tar format. " + UNSAFE_MESSAGE
                )
            tar.extract("storages", path=tmpdir)
            with open(os.path.join(tmpdir, "storages"), "rb", 0) as f:
                num_storages = pickle_module.load(f, **pickle_load_args)
                for _ in range(num_storages):
                    args = pickle_module.load(f, **pickle_load_args)
                    key, location, storage_type = args
                    dtype = storage_type._dtype
                    obj = cast(Storage, torch.UntypedStorage)._new_with_file(
                        f, torch._utils._element_size(dtype)
                    )
                    obj = restore_location(obj, location)
                    # TODO: Once we decide to break serialization FC, we can
                    # stop wrapping with TypedStorage
                    deserialized_objects[key] = torch.storage.TypedStorage(
                        wrap_storage=obj, dtype=dtype, _internal=True
                    )

                storage_views = pickle_module.load(f, **pickle_load_args)
                for target_cdata, root_cdata, offset, numel in storage_views:
                    root = deserialized_objects[root_cdata]
                    element_size = torch._utils._element_size(root.dtype)
                    offset_bytes = offset * element_size
                    # TODO: Once we decide to break serialization FC, we can
                    # stop wrapping with TypedStorage
                    deserialized_objects[target_cdata] = torch.storage.TypedStorage(
                        wrap_storage=root._untyped_storage[
                            offset_bytes : offset_bytes + numel * element_size
                        ],
                        dtype=root.dtype,
                        _internal=True,
                    )

            tar.extract("tensors", path=tmpdir)
            with open(os.path.join(tmpdir, "tensors"), "rb", 0) as f:
                num_tensors = pickle_module.load(f, **pickle_load_args)
                for _ in range(num_tensors):
                    args = pickle_module.load(f, **pickle_load_args)
                    key, storage_id, _original_tensor_type = args
                    storage = deserialized_objects[storage_id]
                    (ndim,) = struct.unpack(" str:
    # NOTE(review): the patch hunk is truncated mid-line above — the remainder
    # of legacy_load / _legacy_load and the `def _maybe_decode_ascii(...)`
    # signature are missing from this view of the diff.
    # When using encoding='bytes' in Py3, some **internal** keys stored as
    # strings in Py2 are loaded as bytes. This function decodes them with
    # ascii encoding, one that Py3 uses by default.
    #
    # NOTE: This should only be used on internal keys (e.g., `typename` and
    # `location` in `persistent_load` below!
    # (tail of _maybe_decode_ascii, whose `def` line sits at the end of the
    # previous chunk) — decode internal byte keys with ASCII, pass str through.
    if isinstance(bytes_str, bytes):
        return bytes_str.decode("ascii")
    return bytes_str


def _get_restore_location(map_location):
    """Build the (storage, location) -> storage resolver for ``map_location``.

    The returned callable implements the ``map_location`` policy used by
    ``_load``:

    * ``None`` — use the registry-driven ``default_restore_location``.
    * ``dict`` — remap the saved location tag through the dict, then restore.
    * ``str``/``bytes`` — force every storage onto that single location.
    * ``torch.device`` — force every storage onto ``str(device)``.
    * callable — user hook; falls back to the default when it returns ``None``.
    """
    if map_location is None:
        restore_location = default_restore_location
    elif isinstance(map_location, dict):

        def restore_location(storage, location):
            # Unmapped tags fall through unchanged (dict.get default).
            location = map_location.get(location, location)
            return default_restore_location(storage, location)

    elif isinstance(map_location, (str, bytes)):

        def restore_location(storage, location):
            # Saved tag is ignored; everything goes to map_location.
            return default_restore_location(storage, map_location)

    elif isinstance(map_location, torch.device):

        def restore_location(storage, location):
            return default_restore_location(storage, str(map_location))

    else:

        def restore_location(storage, location):
            result = map_location(storage, location)
            if result is None:
                # User callback declined; use the registered deserializers.
                result = default_restore_location(storage, location)
            return result

    return restore_location


class StorageType:
    """Stand-in returned by the unpickler for legacy ``torch.*Storage`` names.

    Only carries the dtype recovered from the pickled storage-type name; the
    actual storage object is produced separately by ``persistent_load``.
    """

    def __init__(self, name):
        self._dtype = _get_dtype_from_pickle_storage_type(name)

    @property
    def dtype(self):
        return self._dtype

    def __str__(self):
        return f"StorageType(dtype={self.dtype})"


def _load(
    zip_file,
    map_location,
    pickle_module,
    pickle_file="data.pkl",
    overall_storage=None,
    **pickle_load_args,
):
    """Deserialize a zipfile-format checkpoint (the ``torch.save`` zip layout).

    ``overall_storage`` is an optional pre-mapped storage covering the whole
    archive (mmap path); when present, tensor storages are sliced out of it by
    record offset instead of being read record-by-record.
    """
    restore_location = _get_restore_location(map_location)

    # Cache of already-materialized storages, keyed by the pickled storage key,
    # so aliased tensors share one storage object.
    loaded_storages = {}

    # check if byteswapping is needed
    byteordername = "byteorder"
    byteorderdata = None
    if zip_file.has_record(byteordername):
        byteorderdata = zip_file.get_record(byteordername)
        if byteorderdata not in [b"little", b"big"]:
            raise ValueError("Unknown endianness type: " + byteorderdata.decode())
    elif (
        get_default_load_endianness() == LoadEndianness.LITTLE
        or get_default_load_endianness() is None
    ):
        # No byteorder record in the checkpoint: fall back to the configured
        # default (little-endian when unset).
        byteorderdata = b"little"
    elif get_default_load_endianness() == LoadEndianness.BIG:
        byteorderdata = b"big"
    elif get_default_load_endianness() == LoadEndianness.NATIVE:
        pass
    else:
        raise 
ValueError("Invalid load endianness type")

    if (
        not zip_file.has_record(byteordername)
        and get_default_load_endianness() is None
        and sys.byteorder == "big"
    ):
        # Default behaviour was changed
        # See https://github.com/pytorch/pytorch/issues/101688
        warnings.warn(
            "The default load endianness for checkpoints without a byteorder mark "
            "on big endian machines was changed from 'native' to 'little' endian, "
            "to avoid this behavior please use "
            "torch.serialization.set_default_load_endianness to set "
            "the desired default load endianness",
            UserWarning,
        )

    def load_tensor(dtype, numel, key, location):
        # NOTE(review): despite the name, callers pass a *byte* count as
        # ``numel`` (see persistent_load below, which multiplies by the
        # element size before calling) — confirm before relying on the name.
        name = f"data/{key}"
        if torch._guards.detect_fake_mode(None) is not None:
            # Fake-tensor mode: allocate a meta storage, never touch the bytes.
            nbytes = numel * torch._utils._element_size(dtype)
            storage = torch.UntypedStorage(nbytes, device="meta")
        elif overall_storage is not None:
            # mmap path: slice the pre-mapped archive at the record's offset.
            storage_offset = zip_file.get_record_offset(name)
            storage = overall_storage[storage_offset : storage_offset + numel]
        else:
            storage = (
                zip_file.get_storage_from_record(name, numel, torch.UntypedStorage)
                ._typed_storage()
                ._untyped_storage
            )
        # swap here if byteswapping is needed
        if byteorderdata is not None:
            if byteorderdata.decode() != sys.byteorder:
                storage.byteswap(dtype)

        # TODO: Once we decide to break serialization FC, we can
        # stop wrapping with TypedStorage
        typed_storage = torch.storage.TypedStorage(
            wrap_storage=restore_location(storage, location),
            dtype=dtype,
            _internal=True,
        )

        # Zero data_ptr means an unallocated storage; don't cache those.
        if typed_storage._data_ptr() != 0:
            loaded_storages[key] = typed_storage

        return typed_storage

    def persistent_load(saved_id):
        # Pickle persistent-id hook: every out-of-band object in a torch
        # checkpoint is a ("storage", type, key, location, numel) tuple.
        assert isinstance(saved_id, tuple)
        typename = _maybe_decode_ascii(saved_id[0])
        data = saved_id[1:]

        assert (
            typename == "storage"
        ), f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'"
        storage_type, key, location, numel = data
        if storage_type is torch.UntypedStorage:
            dtype = torch.uint8
        else:
            dtype = 
storage_type.dtype

        if key in loaded_storages:
            # Storage already materialized for an aliasing tensor — reuse it.
            typed_storage = loaded_storages[key]
        else:
            # load_tensor's second argument is a byte count (see its NOTE).
            nbytes = numel * torch._utils._element_size(dtype)
            typed_storage = load_tensor(
                dtype, nbytes, key, _maybe_decode_ascii(location)
            )

        return typed_storage

    # Backward-compat module renames applied during unpickling.
    load_module_mapping: Dict[str, str] = {
        # See https://github.com/pytorch/pytorch/pull/51633
        "torch.tensor": "torch._tensor"
    }

    # Need to subclass Unpickler instead of directly monkey-patching the
    # find_class method because it's marked readonly in pickle.
    # The type: ignore is because mypy can't statically determine the type of this class.
    class UnpicklerWrapper(pickle_module.Unpickler):  # type: ignore[name-defined]
        # from https://stackoverflow.com/questions/13398462/unpickling-python-objects-with-a-changed-module-path/13405732
        # Lets us override the imports that pickle uses when unpickling an object.
        # This is useful for maintaining BC if we change
        # a module path that tensor instantiation relies on.
        def find_class(self, mod_name, name):
            if type(name) is str and "Storage" in name:
                try:
                    # Legacy typed-storage class names resolve to a dtype proxy.
                    return StorageType(name)
                except KeyError:
                    pass
            mod_name = load_module_mapping.get(mod_name, mod_name)
            return super().find_class(mod_name, name)

    # Load the data (which may in turn use `persistent_load` to load tensors)
    data_file = io.BytesIO(zip_file.get_record(pickle_file))

    unpickler = UnpicklerWrapper(data_file, **pickle_load_args)
    unpickler.persistent_load = persistent_load
    # Needed for tensors where storage device and rebuild tensor device are
    # not connected (wrapper subclasses and tensors rebuilt using numpy)
    global _serialization_tls
    _serialization_tls.map_location = map_location
    result = unpickler.load()
    # Reset the thread-local so later unrelated loads don't inherit it.
    _serialization_tls.map_location = None

    torch._utils._validate_loaded_sparse_tensors()
    torch._C._log_api_usage_metadata(
        "torch.load.metadata", {"serialization_id": zip_file.serialization_id()}
    )
    return result


def 
_is_torchscript_zip(zip_file): + return "constants.pkl" in zip_file.get_all_records() diff --git a/deepnvme/model_checkpoint/torch/serialization_orig_v2.6.0.py b/deepnvme/model_checkpoint/torch/serialization_orig_v2.6.0.py new file mode 100644 index 000000000..ad23e92cc --- /dev/null +++ b/deepnvme/model_checkpoint/torch/serialization_orig_v2.6.0.py @@ -0,0 +1,1975 @@ +# mypy: allow-untyped-defs +import copyreg +import difflib +import functools +import io +import os +import pickle +import re +import shutil +import struct +import sys +import tarfile +import tempfile +import threading +import warnings +from contextlib import closing, contextmanager +from enum import Enum +from typing import ( + Any, + BinaryIO, + Callable, + cast, + Dict, + IO, + List, + Optional, + Tuple, + Type, + Union, +) +from typing_extensions import TypeAlias, TypeIs + +import torch +import torch._weights_only_unpickler as _weights_only_unpickler +from torch._sources import get_source_lines_and_file +from torch._utils import _import_dotted_name +from torch.storage import _get_dtype_from_pickle_storage_type +from torch.types import Storage + + +__all__ = [ + "SourceChangeWarning", + "mkdtemp", + "register_package", + "check_module_version_greater_or_equal", + "validate_cuda_device", + "validate_hpu_device", + "location_tag", + "default_restore_location", + "normalize_storage_type", + "storage_to_tensor_type", + "save", + "load", + "StorageType", + "LoadEndianness", + "get_crc32_options", + "set_crc32_options", + "get_default_load_endianness", + "set_default_load_endianness", + "get_default_mmap_options", + "set_default_mmap_options", + "clear_safe_globals", + "get_safe_globals", + "add_safe_globals", + "safe_globals", + "get_unsafe_globals_in_checkpoint", + "skip_data", +] + +DEFAULT_PROTOCOL = 2 + +LONG_SIZE = struct.Struct("=l").size +INT_SIZE = struct.Struct("=i").size +SHORT_SIZE = struct.Struct("=h").size + +MAGIC_NUMBER = 0x1950A86A20F9469CFC6C +PROTOCOL_VERSION = 1001 
+STORAGE_KEY_SEPARATOR = "," + +FILE_LIKE: TypeAlias = Union[str, os.PathLike, BinaryIO, IO[bytes]] +MAP_LOCATION: TypeAlias = Optional[ + Union[Callable[[Storage, str], Storage], torch.device, str, Dict[str, str]] +] +STORAGE: TypeAlias = Union[Storage, torch.storage.TypedStorage, torch.UntypedStorage] + +IS_WINDOWS = sys.platform == "win32" + +UNSAFE_MESSAGE = ( + "In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` " + "from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, " + "but it can result in arbitrary code execution. Do it only if you got the file from a " + "trusted source." +) + +if not IS_WINDOWS: + from mmap import MAP_PRIVATE, MAP_SHARED +else: + MAP_SHARED, MAP_PRIVATE = None, None # type: ignore[assignment] + + +def _default_to_weights_only(pickle_module): + is_fbcode = not hasattr(torch.version, "git_version") + return pickle_module is None and not is_fbcode + + +# _serialization_tls is used to store thread local state specific to serialization +# that needs to be propagated to other files, in particular we use this for +# (1) map_location (needed for wrapper subclasses/third party devices to torch._utils) +# (2) skip_data (needed for torch.Tensor.__reduce_ex__ for skip_data ctx) +# (3) materialize_fake_tensors (needed for torch.Tensor.__reduce_ex__ for skip_data ctx) +class _SerializationLocal(threading.local): + def __init__(self): + super().__init__() + self.map_location: Optional[MAP_LOCATION] = None + self.skip_data: bool = False + self.materialize_fake_tensors: bool = False + + +_serialization_tls = _SerializationLocal() + + +class SourceChangeWarning(Warning): + pass + + +@contextmanager +def mkdtemp(): + path = tempfile.mkdtemp() + try: + yield path + finally: + shutil.rmtree(path) + + +_package_registry: List[ + Tuple[ + int, + Callable[[STORAGE], Optional[str]], + Callable[[STORAGE, str], Optional[STORAGE]], + ] +] = [] + + +class 
LoadEndianness(Enum): + NATIVE = 1 + LITTLE = 2 + BIG = 3 + + +_default_load_endian: Optional[LoadEndianness] = None + + +def get_default_load_endianness() -> Optional[LoadEndianness]: + """ + Get fallback byte order for loading files + + If byteorder mark is not present in saved checkpoint, + this byte order is used as fallback. + By default, it's "native" byte order. + + Returns: + default_load_endian: Optional[LoadEndianness] + """ + return _default_load_endian + + +def set_default_load_endianness(endianness): + """ + Set fallback byte order for loading files + + If byteorder mark is not present in saved checkpoint, + this byte order is used as fallback. + By default, it's "native" byte order. + + Args: + endianness: the new fallback byte order + """ + global _default_load_endian + if not isinstance(endianness, LoadEndianness) and endianness is not None: + raise TypeError("Invalid argument type in function set_default_load_endianness") + _default_load_endian = endianness + + +_compute_crc32: bool = True + + +def get_crc32_options() -> bool: + """ + Get whether :func:`torch.save` computes and writes crc32 for each record. + + Defaults to ``True``. + """ + return _compute_crc32 + + +def set_crc32_options(compute_crc32: bool): + """ + Set whether :func:`torch.save` computes and writes crc32 for each record. + + .. note:: + Setting this to ``False`` may make unzipping of the ``torch.save`` output + fail or warn due to corrupted CRC32. However ``torch.load`` will be + able to load the file. + + Args: + compute_crc32 (bool): set crc32 compuation flag + """ + global _compute_crc32 + _compute_crc32 = compute_crc32 + + +_default_mmap_options: int = MAP_PRIVATE + + +def get_default_mmap_options() -> int: + """ + Get default mmap options for :func:`torch.load` with ``mmap=True``. + + Defaults to ``mmap.MAP_PRIVATE``. 
+ + + Returns: + default_mmap_options: int + """ + return _default_mmap_options + + +class set_default_mmap_options: + """ + Context manager or function to set default mmap options for :func:`torch.load` with ``mmap=True`` to flags. + + For now, only either ``mmap.MAP_PRIVATE`` or ``mmap.MAP_SHARED`` are supported. + Please open an issue if you need any other option to be added here. + + .. note:: + This feature is currently not supported for Windows. + + Args: + flags: ``mmap.MAP_PRIVATE`` or ``mmap.MAP_SHARED`` + """ + + def __init__(self, flags: int) -> None: + if IS_WINDOWS: + raise RuntimeError( + "Changing the default mmap options is currently not supported for Windows" + ) + if flags != MAP_PRIVATE and flags != MAP_SHARED: + raise ValueError( + "Invalid argument in function set_default_mmap_options, " + f"expected mmap.MAP_PRIVATE or mmap.MAP_SHARED, but got {flags}" + ) + global _default_mmap_options + self.prev = _default_mmap_options + _default_mmap_options = flags + + def __enter__(self) -> None: + pass + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + global _default_mmap_options + _default_mmap_options = self.prev + + +def clear_safe_globals() -> None: + """ + Clears the list of globals that are safe for ``weights_only`` load. + """ + _weights_only_unpickler._clear_safe_globals() + + +def get_safe_globals() -> List[Union[Callable, Tuple[Callable, str]]]: + """ + Returns the list of user-added globals that are safe for ``weights_only`` load. + """ + return _weights_only_unpickler._get_safe_globals() + + +def add_safe_globals(safe_globals: List[Union[Callable, Tuple[Callable, str]]]) -> None: + """ + Marks the given globals as safe for ``weights_only`` load. For example, functions + added to this list can be called during unpickling, classes could be instantiated + and have state set. 
+ + Each item in the list can either be a function/class or a tuple of the form + (function/class, string) where string is the full path of the function/class. + + Within the serialized format, each function is identified with its full + path as ``{__module__}.{__name__}``. When calling this API, you can provide this + full path that should match the one in the checkpoint otherwise the default + ``{fn.__module__}.{fn.__name__}`` will be used. + + Args: + safe_globals (List[Union[Callable, Tuple[Callable, str]]]): list of globals to mark as safe + + Example: + >>> # xdoctest: +SKIP("Can't torch.save(t, ...) as doctest thinks MyTensor is defined on torch.serialization") + >>> import tempfile + >>> class MyTensor(torch.Tensor): + ... pass + >>> t = MyTensor(torch.randn(2, 3)) + >>> with tempfile.NamedTemporaryFile() as f: + ... torch.save(t, f.name) + # Running `torch.load(f.name, weights_only=True)` will fail with + # Unsupported global: GLOBAL __main__.MyTensor was not an allowed global by default. + # Check the code and make sure MyTensor is safe to be used when loaded from an arbitrary checkpoint. + ... torch.serialization.add_safe_globals([MyTensor]) + ... torch.load(f.name, weights_only=True) + # MyTensor([[-0.5024, -1.8152, -0.5455], + # [-0.8234, 2.0500, -0.3657]]) + """ + _weights_only_unpickler._add_safe_globals(safe_globals) + + +class safe_globals(_weights_only_unpickler._safe_globals): + r"""Context-manager that adds certain globals as safe for ``weights_only`` load. + + Args: + safe_globals: List of globals for weights_only load. + + Example: + >>> # xdoctest: +SKIP("Can't torch.save(t, ...) as doctest thinks MyTensor is defined on torch.serialization") + >>> import tempfile + >>> class MyTensor(torch.Tensor): + ... pass + >>> t = MyTensor(torch.randn(2, 3)) + >>> with tempfile.NamedTemporaryFile() as f: + ... 
torch.save(t, f.name) + # Running `torch.load(f.name, weights_only=True)` will fail with + # Unsupported global: GLOBAL __main__.MyTensor was not an allowed global by default. + # Check the code and make sure MyTensor is safe to be used when loaded from an arbitrary checkpoint. + ... with torch.serialization.safe_globals([MyTensor]): + ... torch.load(f.name, weights_only=True) + # MyTensor([[-0.5024, -1.8152, -0.5455], + # [-0.8234, 2.0500, -0.3657]]) + >>> assert torch.serialization.get_safe_globals() == [] + """ + + +def get_unsafe_globals_in_checkpoint(f: FILE_LIKE) -> List[str]: + """Returns a list of strings of functions/classes in a ``torch.save`` object that are not safe for ``weights_only``. + + For a given function or class ``f``, the corresponding string will be of the form + ``{f.__module__}.{f.__name__}``. + + This function will return any GLOBALs in the checkpoint that are not in the set marked safe + for ``weights_only`` (either via :func:`add_safe_globals` or :class:`safe_globals` context or + allowlisted by ``torch`` by default). + + .. note:: + This function will statically disassemble the pickle file in the checkpoint. + The implication is any classes dynamically pushed onto the stack during unpickling + will not be included in the output. + + Args: + f: File-like object or string containing the checkpoint object saved via ``torch.save`` + + Returns: + A list of strings of pickle GLOBALs in the checkpoint that are not allowlisted for ``weights_only``. 
+ """ + default_safe_globals_strings = set( + _weights_only_unpickler._get_allowed_globals().keys() + ) + user_safe_global_strings = set( + _weights_only_unpickler._get_user_allowed_globals().keys() + ) + safe_global_strings = default_safe_globals_strings.union(user_safe_global_strings) + + with _open_file_like(f, "rb") as opened_file: + if not _is_zipfile(opened_file): + raise ValueError("Expected input to be a checkpoint returned by torch.save") + with _open_zipfile_reader(opened_file) as zip_file: + if _is_torchscript_zip(zip_file): + raise ValueError( + "Expected input to be a checkpoint returned by torch.save but got a torchscript checkpoint" + ) + data_file = io.BytesIO(zip_file.get_record("data.pkl")) + all_globals = _weights_only_unpickler.get_globals_in_pkl(data_file) + return list(all_globals.difference(safe_global_strings)) + + +class skip_data: + """ + Context-manager that skips writing storage bytes for ``torch.save`` calls. + + Storages will still be saved, but the space that their bytes would usually be written to + will be empty space. The storage bytes can then be populated in a separate pass. + + .. warning:: + The ``skip_data`` context manager is an early prototype and is subject to change. + + Args: + materialize_fake_tensors: Whether to materialize FakeTensors. + + Example: + >>> # xdoctest: +SKIP("NamedTemporaryFile on Windows") + >>> import tempfile + >>> t = torch.randn(2, 3) + >>> with tempfile.NamedTemporaryFile() as f: + ... with torch.serialization.skip_data(): + ... torch.save(t, f.name) + ... 
torch.load(f.name, weights_only=True) + tensor([[0., 0., 0.], + [0., 0., 0.]]) + """ + + def __init__(self, materialize_fake_tensors: bool = False): + self.materialize_fake_tensors = materialize_fake_tensors + + def __enter__(self): + global _serialization_tls + self._old_skip_data = _serialization_tls.skip_data + self._old_materialize_fake_tensors = _serialization_tls.materialize_fake_tensors + _serialization_tls.skip_data = True + _serialization_tls.materialize_fake_tensors = self.materialize_fake_tensors + + def __exit__(self, type, value, tb): + global _serialization_tls + _serialization_tls.skip_data = self._old_skip_data + _serialization_tls.materialize_fake_tensors = self._old_materialize_fake_tensors + + +def _is_zipfile(f) -> bool: + # This is a stricter implementation than zipfile.is_zipfile(). + # zipfile.is_zipfile() is True if the magic number appears anywhere in the + # binary. Since we expect the files here to be generated by torch.save or + # torch.jit.save, it's safe to only check the start bytes and avoid + # collisions and assume the zip has only 1 file. + # See bugs.python.org/issue28494. + + start = f.tell() + # Read the first few bytes and match against the ZIP file signature + local_header_magic_number = b"PK\x03\x04" + read_bytes = f.read(len(local_header_magic_number)) + f.seek(start) + return read_bytes == local_header_magic_number + + +def register_package( + priority: int, + tagger: Callable[[STORAGE], Optional[str]], + deserializer: Callable[[STORAGE, str], Optional[STORAGE]], +): + """ + Registers callables for tagging and deserializing storage objects with an associated priority. + Tagging associates a device with a storage object at save time while deserializing moves a + storage object to an appropriate device at load time. :attr:`tagger` and :attr:`deserializer` + are run in the order given by their :attr:`priority` until a tagger/deserializer returns a + value that is not `None`. 
+ + To override the deserialization behavior for a device in the global registry, one can register a + tagger with a higher priority than the existing tagger. + + This function can also be used to register a tagger and deserializer for new devices. + + Args: + priority: Indicates the priority associated with the tagger and deserializer, where a lower + value indicates higher priority. + tagger: Callable that takes in a storage object and returns its tagged device as a string + or None. + deserializer: Callable that takes in storage object and a device string and returns a storage + object on the appropriate device or None. + + Returns: + `None` + + Example: + >>> def ipu_tag(obj): + >>> if obj.device.type == 'ipu': + >>> return 'ipu' + >>> def ipu_deserialize(obj, location): + >>> if location.startswith('ipu'): + >>> ipu = getattr(torch, "ipu", None) + >>> assert ipu is not None, "IPU device module is not loaded" + >>> assert torch.ipu.is_available(), "ipu is not available" + >>> return obj.ipu(location) + >>> torch.serialization.register_package(11, ipu_tag, ipu_deserialize) + """ + queue_elem = (priority, tagger, deserializer) + _package_registry.append(queue_elem) + _package_registry.sort() + + +def check_module_version_greater_or_equal( + module, + req_version_tuple, + error_if_malformed=True, +): + """ + Check if a module's version satisfies requirements + + Usually, a module's version string will be like 'x.y.z', which would be represented + as a tuple (x, y, z), but sometimes it could be an unexpected format. If the version + string does not match the given tuple's format up to the length of the tuple, then + error and exit or emit a warning. 
+ + Args: + module: the module to check the version of + req_version_tuple: tuple (usually of ints) representing the required version + error_if_malformed: whether we should exit if module version string is malformed + + Returns: + requirement_is_met: bool + """ + try: + version_strs = module.__version__.split(".") + # Cast module version fields to match the types of the required version + module_version = tuple( + type(req_field)(version_strs[idx]) + for idx, req_field in enumerate(req_version_tuple) + ) + requirement_is_met = module_version >= req_version_tuple + + except Exception as e: + message = ( + f"'{module.__name__}' module version string is malformed '{module.__version__}' and cannot be compared" + f" with tuple {str(req_version_tuple)}" + ) + if error_if_malformed: + raise RuntimeError(message) from e + else: + warnings.warn(message + ", but continuing assuming that requirement is met") + requirement_is_met = True + + return requirement_is_met + + +def _cpu_tag(obj): + if obj.device.type == "cpu": + return "cpu" + + +def _mps_tag(obj): + if obj.device.type == "mps": + return "mps" + + +def _meta_tag(obj): + if obj.device.type == "meta": + return "meta" + + +def _backend_tag(backend_name, obj): + if backend_name == "privateuse1": + backend_name = torch._C._get_privateuse1_backend_name() + if obj.device.type == backend_name: + if obj.device.index is None: + return backend_name + else: + return backend_name + ":" + str(obj.device.index) + + +def _cpu_deserialize(obj, location): + if location == "cpu": + return obj + + +def _mps_deserialize(obj, location): + if location.startswith("mps"): + return obj.mps() + + +def _meta_deserialize(obj, location): + if location == "meta": + return torch.UntypedStorage(obj.nbytes(), device="meta") + + +def _validate_device(location, backend_name): + """ + Check whether the device index of specified backend is valid + + In case of privateuse1 backend, your must first register a device_module for + privateuse1 using 
torch._register_device_module. Implement the following + methods in device_module like cuda: device_module._utils._get_device_index(location, True), + device_module.device_count(). + + Args: + location: string of device + backend_name: the backend name or the name of privateuse1, which can be renamed + + Returns: + device_index: int + """ + if not hasattr(torch, backend_name): + raise RuntimeError( + f"The {backend_name.upper()} device module is not registered. " + "If you are running on a CPU-only machine, " + "please use torch.load with map_location=torch.device('cpu') " + "to map your storages to the CPU." + ) + device_module = getattr(torch, backend_name) + if hasattr(device_module, "_utils") and hasattr( + device_module._utils, "_get_device_index" + ): + device_index = device_module._utils._get_device_index(location, True) + device = torch.device(backend_name, device_index) + else: + device = torch.device(location) + device_index = device.index if device.index else 0 + if hasattr(device_module, "is_available") and not device_module.is_available(): + raise RuntimeError( + f"Attempting to deserialize object on a {backend_name.upper()} " + f"device but torch.{backend_name}.is_available() is False. " + "If you are running on a CPU-only machine, " + "please use torch.load with map_location=torch.device('cpu') " + "to map your storages to the CPU." + ) + if hasattr(device_module, "device_count"): + device_count = device_module.device_count() + if device_index >= device_count: + raise RuntimeError( + f"Attempting to deserialize object on {backend_name.upper()} device " + f"{device_index} but torch.{backend_name}.device_count() is {device_count}. " + "Please use torch.load with map_location to map your storages " + "to an existing device." 
+ ) + return device + + +def validate_cuda_device(location): + return _validate_device(location, "cuda").index + + +def validate_hpu_device(location): + return _validate_device(location, "hpu").index + + +def _deserialize(backend_name, obj, location): + if backend_name == "privateuse1": + backend_name = torch._C._get_privateuse1_backend_name() + if location.startswith(backend_name): + device = _validate_device(location, backend_name) + return obj.to(device=device) + + +register_package(10, _cpu_tag, _cpu_deserialize) +register_package( + 20, + functools.partial(_backend_tag, "cuda"), + functools.partial(_deserialize, "cuda"), +) +register_package(21, _mps_tag, _mps_deserialize) +register_package(22, _meta_tag, _meta_deserialize) +register_package( + 23, + functools.partial(_backend_tag, "privateuse1"), + functools.partial(_deserialize, "privateuse1"), +) +register_package( + 24, + functools.partial(_backend_tag, "hpu"), + functools.partial(_deserialize, "hpu"), +) +register_package( + 25, + functools.partial(_backend_tag, "xpu"), + functools.partial(_deserialize, "xpu"), +) + + +def location_tag( + storage: Union[Storage, torch.storage.TypedStorage, torch.UntypedStorage], +): + for _, tagger, _ in _package_registry: + location = tagger(storage) + if location: + return location + raise RuntimeError( + "don't know how to determine data location of " + torch.typename(storage) + ) + + +def default_restore_location(storage, location): + """ + Restores `storage` using a deserializer function registered for the `location`. + + This function looks in the registry for deserializer functions that match the `location`. + If found, it attempts to use them, in priority order, to restore `storage` until one + returns a not `None` result. If no deserializer can be found in the registry, or all found fail + to bear a result, it raises a `RuntimeError`. 
+ + Args: + storage (STORAGE): the storage object to restore + location (str): the location tag associated with the storage object + + Returns: + storage: Optional[STORAGE] + + Raises: + RuntimeError: If no deserializer matching `location` is found in the registry or if + all matching ones return `None`. + """ + for _, _, fn in _package_registry: + result = fn(storage, location) + if result is not None: + return result + raise RuntimeError( + "don't know how to restore data location of " + + torch.typename(storage) + + " (tagged with " + + location + + ")" + ) + + +def normalize_storage_type(storage_type): + return getattr(torch, storage_type.__name__) + + +def storage_to_tensor_type(storage): + storage_type = type(storage) + module = _import_dotted_name(storage_type.__module__) + return getattr(module, storage_type.__name__.replace("Storage", "Tensor")) + + +def _is_path(name_or_buffer) -> TypeIs[Union[str, os.PathLike]]: + return isinstance(name_or_buffer, (str, os.PathLike)) + + +class _opener: + def __init__(self, file_like): + self.file_like = file_like + + def __enter__(self): + return self.file_like + + def __exit__(self, *args): + pass + + +class _open_file(_opener): + def __init__(self, name, mode): + super().__init__(open(name, mode)) + + def __exit__(self, *args): + self.file_like.close() + + +class _open_buffer_reader(_opener): + def __init__(self, buffer): + super().__init__(buffer) + _check_seekable(buffer) + + +class _open_buffer_writer(_opener): + def __exit__(self, *args): + self.file_like.flush() + + +def _open_file_like(name_or_buffer, mode): + if _is_path(name_or_buffer): + return _open_file(name_or_buffer, mode) + else: + if "w" in mode: + return _open_buffer_writer(name_or_buffer) + elif "r" in mode: + return _open_buffer_reader(name_or_buffer) + else: + raise RuntimeError(f"Expected 'r' or 'w' in mode but got {mode}") + + +class _open_zipfile_reader(_opener): + def __init__(self, name_or_buffer) -> None: + 
super().__init__(torch._C.PyTorchFileReader(name_or_buffer)) + + +class _open_zipfile_writer_file(_opener): + def __init__(self, name) -> None: + self.file_stream = None + self.name = str(name) + try: + self.name.encode("ascii") + except UnicodeEncodeError: + # PyTorchFileWriter only supports ascii filename. + # For filenames with non-ascii characters, we rely on Python + # for writing out the file. + self.file_stream = io.FileIO(self.name, mode="w") + super().__init__( + torch._C.PyTorchFileWriter(self.file_stream, _compute_crc32) + ) + else: + super().__init__(torch._C.PyTorchFileWriter(self.name, _compute_crc32)) + + def __exit__(self, *args) -> None: + self.file_like.write_end_of_file() + if self.file_stream is not None: + self.file_stream.close() + + +class _open_zipfile_writer_buffer(_opener): + def __init__(self, buffer) -> None: + if not callable(getattr(buffer, "write", None)): + msg = f"Buffer of {str(type(buffer)).strip('<>')} has no callable attribute 'write'" + if not hasattr(buffer, "write"): + raise AttributeError(msg) + raise TypeError(msg) + self.buffer = buffer + super().__init__(torch._C.PyTorchFileWriter(buffer, _compute_crc32)) + + def __exit__(self, *args) -> None: + self.file_like.write_end_of_file() + self.buffer.flush() + + +def _open_zipfile_writer(name_or_buffer): + container: Type[_opener] + if _is_path(name_or_buffer): + container = _open_zipfile_writer_file + else: + container = _open_zipfile_writer_buffer + return container(name_or_buffer) + + +def _is_compressed_file(f) -> bool: + compress_modules = ["gzip"] + try: + return f.__module__ in compress_modules + except AttributeError: + return False + + +def _should_read_directly(f): + """ + Checks if f is a file that should be read directly. It should be read + directly if it is backed by a real file (has a fileno) and is not a + a compressed file (e.g. 
gzip) + """ + if _is_compressed_file(f): + return False + try: + return f.fileno() >= 0 + except io.UnsupportedOperation: + return False + except AttributeError: + return False + + +def _check_seekable(f) -> bool: + def raise_err_msg(patterns, e): + for p in patterns: + if p in str(e): + msg = ( + str(e) + + ". You can only torch.load from a file that is seekable." + + " Please pre-load the data into a buffer like io.BytesIO and" + + " try to load from it instead." + ) + raise type(e)(msg) + raise e + + try: + f.seek(f.tell()) + return True + except (io.UnsupportedOperation, AttributeError) as e: + raise_err_msg(["seek", "tell"], e) + return False + + +def _check_dill_version(pickle_module) -> None: + """Checks if using dill as the pickle module, and if so, checks if it is the correct version. + If dill version is lower than 0.3.1, a ValueError is raised. + + Args: + pickle_module: module used for pickling metadata and objects + + """ + if pickle_module is not None and pickle_module.__name__ == "dill": + required_dill_version = (0, 3, 1) + if not check_module_version_greater_or_equal( + pickle_module, required_dill_version, False + ): + raise ValueError( + ( + "'torch' supports dill >= {}, but you have dill {}." + " Please upgrade dill or switch to 'pickle'" + ).format( + ".".join([str(num) for num in required_dill_version]), + pickle_module.__version__, + ) + ) + + +def _check_save_filelike(f): + if not _is_path(f) and not hasattr(f, "write"): + raise AttributeError( + "expected 'f' to be string, path, or a file-like object with " + "a 'write' attribute" + ) + + +def save( + obj: object, + f: FILE_LIKE, + pickle_module: Any = pickle, + pickle_protocol: int = DEFAULT_PROTOCOL, + _use_new_zipfile_serialization: bool = True, + _disable_byteorder_record: bool = False, +) -> None: + # Reference: https://github.com/pytorch/pytorch/issues/54354 + # The first line of this docstring overrides the one Sphinx generates for the + # documentation. 
We need it so that Sphinx doesn't leak `pickle`s path from + # the build environment (e.g. `>> # xdoctest: +SKIP("makes cwd dirty") + >>> # Save to file + >>> x = torch.tensor([0, 1, 2, 3, 4]) + >>> torch.save(x, "tensor.pt") + >>> # Save to io.BytesIO buffer + >>> buffer = io.BytesIO() + >>> torch.save(x, buffer) + """ + torch._C._log_api_usage_once("torch.save") + _check_dill_version(pickle_module) + _check_save_filelike(f) + + if _use_new_zipfile_serialization: + with _open_zipfile_writer(f) as opened_zipfile: + _save( + obj, + opened_zipfile, + pickle_module, + pickle_protocol, + _disable_byteorder_record, + ) + return + else: + global _serialization_tls + if _serialization_tls.skip_data: + raise RuntimeError( + "Cannot use skip_data=True with _use_new_zipfile_serialization=False" + ) + with _open_file_like(f, "wb") as opened_file: + _legacy_save(obj, opened_file, pickle_module, pickle_protocol) + + +def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None: + import torch.nn as nn + + serialized_container_types = {} + serialized_storages: Dict[str, Tuple[torch.UntypedStorage, torch.dtype]] = {} + + # Since loading storages that view the same data with different dtypes is + # not supported, we need to keep track of the dtype associated with each + # storage data_ptr and throw an error if the dtype is ever different. + # TODO: This feature could be added in the future + storage_dtypes: Dict[int, torch.dtype] = {} + + def persistent_id(obj: Any) -> Optional[Tuple]: + # FIXME: the docs say that persistent_id should only return a string + # but torch store returns tuples. 
This works only in the binary protocol + # see + # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects + # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 + if isinstance(obj, type) and issubclass(obj, nn.Module): + if obj in serialized_container_types: + return None + serialized_container_types[obj] = True + source_file = source = None + try: + source_lines, _, source_file = get_source_lines_and_file(obj) + source = "".join(source_lines) + except ( + Exception + ): # saving the source is optional, so we can ignore any errors + warnings.warn( + "Couldn't retrieve source code for container of " + "type " + obj.__name__ + ". It won't be checked " + "for correctness upon loading." + ) + return ("module", obj, source_file, source) + + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + storage: torch.UntypedStorage + + if isinstance(obj, torch.storage.TypedStorage): + # TODO: Once we decide to break serialization FC, this case + # can be deleted + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + dtype = obj.dtype + storage_numel = obj._size() + + elif isinstance(obj, torch.UntypedStorage): + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + dtype = torch.uint8 + storage_numel = storage.nbytes() + else: + raise TypeError(f"type not recognized: {type(obj)}") + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. 
If storage is
+            # not allocated, don't perform this check
+            if storage.data_ptr() != 0:
+                if storage.data_ptr() in storage_dtypes:
+                    if storage_dtype != storage_dtypes[storage.data_ptr()]:
+                        raise RuntimeError(
+                            "Cannot save multiple tensors or storages that "
+                            "view the same data as different types"
+                        )
+                else:
+                    storage_dtypes[storage.data_ptr()] = storage_dtype
+
+            view_metadata: Optional[Tuple[str, int, int]]
+
+            # Offset is always 0, but we keep it for backwards compatibility
+            # with the old serialization format (which supported storage views)
+            offset = 0
+            storage_key = str(storage._cdata)
+            location = location_tag(storage)
+
+            # TODO: There's an issue here with FC. It might be impossible to
+            # solve, but it's worth noting. Imagine we save a list `[storage,
+            # tensor]`, where `tensor.storage()` is the same as `storage`, and
+            # `tensor.element_size() > 1`. Let's say that `tensor.dtype ==
+            # torch.float`. The storage will be serialized with element size
+            # of 1, since we're choosing to serialize the first occurrence of
+            # a duplicate storage. Since this legacy serialization format saves
+            # the numel of the storage, rather than nbytes directly, we'll be
+            # effectively saving nbytes in this case. We'll be able to load it
+            # and the tensor back up with no problems in _this_ and future
+            # versions of pytorch, but in older versions, here's the problem:
+            # the storage will be loaded up as a UntypedStorage, and then the
+            # FloatTensor will be loaded and the UntypedStorage will be assigned
+            # to it. Since the storage dtype does not match the tensor dtype, this
+            # will cause an error. If we reverse the list, like `[tensor,
+            # storage]`, then we will save the `tensor.storage()` as a faked
+            # `FloatStorage`, and the saved size will be the correct
+            # dtype-specific numel count that old versions expect. `tensor`
+            # will be able to load up properly in old versions, pointing to
+            # a FloatStorage. 
However, `storage` is still being translated to + # a UntypedStorage, and it will try to resolve to the same + # FloatStorage that `tensor` contains. This will also cause an + # error. It doesn't seem like there's any way around this. + # Probably, we just cannot maintain FC for the legacy format if the + # saved list contains both a tensor and a storage that point to the + # same data. We should still be able to maintain FC for lists of + # just tensors, as long as all views share the same dtype as the + # tensor they are viewing. + + if storage_key not in serialized_storages: + serialized_storages[storage_key] = (storage, dtype) + is_view = storage._cdata != storage._cdata + if is_view: + view_metadata = (str(storage._cdata), offset, storage.nbytes()) + else: + view_metadata = None + + res = ( + "storage", + storage_type, + storage_key, + location, + storage_numel, + view_metadata, + ) + return res + return None + + sys_info = dict( + protocol_version=PROTOCOL_VERSION, + little_endian=sys.byteorder == "little", + type_sizes=dict( + short=SHORT_SIZE, + int=INT_SIZE, + long=LONG_SIZE, + ), + ) + + pickle_module.dump(MAGIC_NUMBER, f, protocol=pickle_protocol) + pickle_module.dump(PROTOCOL_VERSION, f, protocol=pickle_protocol) + pickle_module.dump(sys_info, f, protocol=pickle_protocol) + + class PyTorchLegacyPickler(pickle_module.Pickler): + def persistent_id(self, obj): + return persistent_id(obj) + + pickler = PyTorchLegacyPickler(f, protocol=pickle_protocol) + pickler.dump(obj) + + serialized_storage_keys = sorted(serialized_storages.keys()) + pickle_module.dump(serialized_storage_keys, f, protocol=pickle_protocol) + f.flush() + for key in serialized_storage_keys: + storage, dtype = serialized_storages[key] + storage._write_file( + f, _should_read_directly(f), True, torch._utils._element_size(dtype) + ) + + +def _save( + obj, + zip_file, + pickle_module, + pickle_protocol, + _disable_byteorder_record, +): + serialized_storages = {} + id_map: Dict[int, str] = {} + 
+ # Since loading storages that view the same data with different dtypes is + # not supported, we need to keep track of the dtype associated with each + # storage data_ptr and throw an error if the dtype is ever different. + # TODO: This feature could be added in the future + storage_dtypes: Dict[int, torch.dtype] = {} + + def persistent_id(obj): + # FIXME: the docs say that persistent_id should only return a string + # but torch store returns tuples. This works only in the binary protocol + # see + # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects + # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + if isinstance(obj, torch.storage.TypedStorage): + # TODO: Once we decide to break serialization FC, this case + # can be deleted + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + storage_numel = obj._size() + + else: + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + storage_numel = storage.nbytes() + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. 
If storage is + # not allocated, don't perform this check + if str(storage.device) != "meta" and storage.data_ptr() != 0: + if storage.data_ptr() in storage_dtypes: + if storage_dtype != storage_dtypes[storage.data_ptr()]: + raise RuntimeError( + "Cannot save multiple tensors or storages that " + "view the same data as different types" + ) + else: + storage_dtypes[storage.data_ptr()] = storage_dtype + + storage_key = id_map.setdefault(storage._cdata, str(len(id_map))) + if hasattr(obj, "_fake_device") and obj._fake_device is not None: + location = str(obj._fake_device) + else: + location = location_tag(storage) + serialized_storages[storage_key] = storage + + return ("storage", storage_type, storage_key, location, storage_numel) + + return None + + # Write the pickle data for `obj` + data_buf = io.BytesIO() + + class PyTorchPickler(pickle_module.Pickler): # type: ignore[name-defined] + def persistent_id(self, obj): + return persistent_id(obj) + + pickler = PyTorchPickler(data_buf, protocol=pickle_protocol) + pickler.dump(obj) + data_value = data_buf.getvalue() + zip_file.write_record("data.pkl", data_value, len(data_value)) + + # Write byte order marker + if not _disable_byteorder_record: + if sys.byteorder not in ["little", "big"]: + raise ValueError("Unknown endianness type: " + sys.byteorder) + + zip_file.write_record("byteorder", sys.byteorder, len(sys.byteorder)) + + # Write each tensor to a file named tensor/the_tensor_key in the zip archive + for key in sorted(serialized_storages.keys()): + name = f"data/{key}" + storage = serialized_storages[key] + num_bytes = storage.nbytes() + global _serialization_tls + if _serialization_tls.skip_data: + zip_file.write_record_metadata(name, num_bytes) + else: + # given that we copy things around anyway, we might use storage.cpu() + # this means to that to get tensors serialized, you need to implement + # .cpu() on the underlying Storage + if storage.device.type != "cpu": + storage = storage.cpu() + # Now that it is on 
the CPU we can directly copy it into the zip file + zip_file.write_record(name, storage, num_bytes) + + +def load( + f: FILE_LIKE, + map_location: MAP_LOCATION = None, + pickle_module: Any = None, + *, + weights_only: Optional[bool] = None, + mmap: Optional[bool] = None, + **pickle_load_args: Any, +) -> Any: + # Reference: https://github.com/pytorch/pytorch/issues/54354 + # The first line of this docstring overrides the one Sphinx generates for the + # documentation. We need it so that Sphinx doesn't leak `pickle`s path from + # the build environment (e.g. `>> # xdoctest: +SKIP("undefined filepaths") + >>> torch.load("tensors.pt", weights_only=True) + # Load all tensors onto the CPU + >>> torch.load("tensors.pt", map_location=torch.device("cpu"), weights_only=True) + # Load all tensors onto the CPU, using a function + >>> torch.load( + ... "tensors.pt", map_location=lambda storage, loc: storage, weights_only=True + ... ) + # Load all tensors onto GPU 1 + >>> torch.load( + ... "tensors.pt", + ... map_location=lambda storage, loc: storage.cuda(1), + ... weights_only=True, + ... ) # type: ignore[attr-defined] + # Map tensors from GPU 1 to GPU 0 + >>> torch.load("tensors.pt", map_location={"cuda:1": "cuda:0"}, weights_only=True) + # Load tensor from io.BytesIO object + # Loading from a buffer setting weights_only=False, warning this can be unsafe + >>> with open("tensor.pt", "rb") as f: + ... buffer = io.BytesIO(f.read()) + >>> torch.load(buffer, weights_only=False) + # Load a module with 'ascii' encoding for unpickling + # Loading from a module setting weights_only=False, warning this can be unsafe + >>> torch.load("module.pt", encoding="ascii", weights_only=False) + """ + torch._C._log_api_usage_once("torch.load") + DOCS_MESSAGE = ( + "\n\nCheck the documentation of torch.load to learn more about types accepted by default with " + "weights_only https://pytorch.org/docs/stable/generated/torch.load.html." 
+ ) + + def _get_wo_message(message: str) -> str: + unsafe_global_pattern = r"GLOBAL (\S+) was not an allowed global by default." + has_unsafe_global = re.search(unsafe_global_pattern, message) is not None + blocklist_pattern = r"whose module (\S+) is blocked" + has_blocklist = re.search(blocklist_pattern, message) is not None + import_pattern = r"(\S+) must be (\S+) to load" + has_import = re.search(import_pattern, message) is not None + if has_unsafe_global: + updated_message = ( + "Weights only load failed. This file can still be loaded, to do so you have two options, " + "\033[1mdo those steps only if you trust the source of the checkpoint\033[0m. " + f"\n\t(1) {UNSAFE_MESSAGE}\n\t(2) Alternatively, to load with `weights_only=True` please check " + "the recommended steps in the following error message.\n\tWeightsUnpickler error: " + + message + ) + else: + if has_import: + return f"Weights only load failed. {message}\n {UNSAFE_MESSAGE}\n" + else: + updated_message = f"Weights only load failed. {UNSAFE_MESSAGE}\n" + if not has_blocklist: + updated_message += ( + "Please file an issue with the following so that we can make " + "`weights_only=True` compatible with your use case: WeightsUnpickler error: " + ) + updated_message += message + return updated_message + DOCS_MESSAGE + + global _serialization_tls + skip_data = _serialization_tls.skip_data + if skip_data: + raise RuntimeError( + "`torch.load` called within a torch.serialization.skip_data context manager " + "is not supported yet. Please call torch.load outside the skip_data context manager." 
+ ) + + weights_only_not_set = weights_only is None + + if weights_only_not_set: + weights_only = _default_to_weights_only(pickle_module) + + true_values = ["1", "y", "yes", "true"] + # Add ability to force safe only or non-safe weight loads via environment variables + force_weights_only_load = ( + os.getenv("TORCH_FORCE_WEIGHTS_ONLY_LOAD", "0") in true_values + ) + force_no_weights_only_load = ( + os.getenv("TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD", "0") in true_values + ) + + if force_weights_only_load and force_no_weights_only_load: + raise RuntimeError( + "Only one of `TORCH_FORCE_WEIGHTS_ONLY_LOAD` or `TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD` " + "should be set, but both were set." + ) + elif force_weights_only_load: + weights_only = True + elif force_no_weights_only_load: + # TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD can only override if callsite did not explicitly set weights_only + if weights_only_not_set: + warnings.warn( + "Environment variable TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD detected, since the" + "`weights_only` argument was not explicitly passed to `torch.load`, forcing weights_only=False.", + UserWarning, + stacklevel=2, + ) + weights_only = False + + if weights_only: + if pickle_module is not None: + raise RuntimeError( + "Can not safely load weights when explicit pickle_module is specified" + ) + else: + if pickle_module is None: + pickle_module = pickle + + # make flipping default BC-compatible + if mmap is None: + mmap = False + + _check_dill_version(pickle_module) + + if "encoding" not in pickle_load_args.keys(): + pickle_load_args["encoding"] = "utf-8" + + with _open_file_like(f, "rb") as opened_file: + if _is_zipfile(opened_file): + # The zipfile reader is going to advance the current file position. + # If we want to actually tail call to torch.jit.load, we need to + # reset back to the original position. 
+ orig_position = opened_file.tell() + overall_storage = None + with _open_zipfile_reader(opened_file) as opened_zipfile: + if _is_torchscript_zip(opened_zipfile): + warnings.warn( + "'torch.load' received a zip file that looks like a TorchScript archive" + " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to" + " silence this warning)", + UserWarning, + ) + if weights_only: + raise RuntimeError( + "Cannot use ``weights_only=True`` with TorchScript archives passed to " + "``torch.load``. " + UNSAFE_MESSAGE + ) + opened_file.seek(orig_position) + return torch.jit.load(opened_file, map_location=map_location) + if mmap: + if not _is_path(f): + raise ValueError( + "f must be a file path in order to use the mmap argument" + ) + size = os.path.getsize(f) + if not IS_WINDOWS: + shared = get_default_mmap_options() == MAP_SHARED + else: + shared = False + overall_storage = torch.UntypedStorage.from_file( + os.fspath(f), shared, size + ) + if weights_only: + try: + return _load( + opened_zipfile, + map_location, + _weights_only_unpickler, + overall_storage=overall_storage, + **pickle_load_args, + ) + except pickle.UnpicklingError as e: + raise pickle.UnpicklingError(_get_wo_message(str(e))) from None + return _load( + opened_zipfile, + map_location, + pickle_module, + overall_storage=overall_storage, + **pickle_load_args, + ) + if mmap: + f_name = "" if not isinstance(f, str) else f"{f}, " + raise RuntimeError( + "mmap can only be used with files saved with " + f"`torch.save({f_name}_use_new_zipfile_serialization=True), " + "please torch.save your checkpoint with this option in order to use mmap." 
+ ) + if weights_only: + try: + return _legacy_load( + opened_file, + map_location, + _weights_only_unpickler, + **pickle_load_args, + ) + except pickle.UnpicklingError as e: + raise pickle.UnpicklingError(_get_wo_message(str(e))) from None + return _legacy_load( + opened_file, map_location, pickle_module, **pickle_load_args + ) + + +# Register pickling support for layout instances such as +# torch.sparse_coo, etc +def _get_layout(name): + """Get layout extension object from its string representation.""" + cache = _get_layout.cache # type: ignore[attr-defined] + if not cache: + for v in torch.__dict__.values(): + if isinstance(v, torch.layout): + cache[str(v)] = v + return cache[name] + + +# There are yet not good way to type annotate function attributes https://github.com/python/mypy/issues/2087 +_get_layout.cache = {} # type: ignore[attr-defined] +copyreg.pickle(torch.layout, lambda obj: (_get_layout, (str(obj),))) + + +def _legacy_load(f, map_location, pickle_module, **pickle_load_args): + deserialized_objects: Dict[int, Any] = {} + + restore_location = _get_restore_location(map_location) + + class UnpicklerWrapper(pickle_module.Unpickler): # type: ignore[name-defined] + def find_class(self, mod_name, name): + if type(name) is str and "Storage" in name: + try: + return StorageType(name) + except KeyError: + pass + return super().find_class(mod_name, name) + + def _check_container_source(container_type, source_file, original_source): + try: + current_source = "".join(get_source_lines_and_file(container_type)[0]) + except Exception: # saving the source is optional, so we can ignore any errors + warnings.warn( + "Couldn't retrieve source code for container of " + "type " + container_type.__name__ + ". It won't be checked " + "for correctness upon loading." 
+ ) + return + if original_source != current_source: + if container_type.dump_patches: + file_name = container_type.__name__ + ".patch" + diff = difflib.unified_diff( + current_source.split("\n"), + original_source.split("\n"), + source_file, + source_file, + lineterm="", + ) + lines = "\n".join(diff) + try: + with open(file_name, "a+") as f: + file_size = f.seek(0, 2) + f.seek(0) + if file_size == 0: + f.write(lines) + elif file_size != len(lines) or f.read() != lines: + raise OSError + msg = ( + "Saved a reverse patch to " + file_name + ". " + "Run `patch -p0 < " + file_name + "` to revert your " + "changes." + ) + except OSError: + msg = ( + "Tried to save a patch, but couldn't create a " + "writable file " + file_name + ". Make sure it " + "doesn't exist and your working directory is " + "writable." + ) + else: + msg = ( + "you can retrieve the original source code by " + "accessing the object's source attribute or set " + "`torch.nn.Module.dump_patches = True` and use the " + "patch tool to revert the changes." + ) + msg = f"source code of class '{torch.typename(container_type)}' has changed. {msg}" + warnings.warn(msg, SourceChangeWarning) + + def legacy_load(f): + deserialized_objects: Dict[int, Any] = {} + + def persistent_load(saved_id): + if isinstance(saved_id, tuple): + # Ignore containers that don't have any sources saved + if all(saved_id[1:]): + _check_container_source(*saved_id) + return saved_id[0] + return deserialized_objects[int(saved_id)] + + with closing( + tarfile.open(fileobj=f, mode="r:", format=tarfile.PAX_FORMAT) + ) as tar, mkdtemp() as tmpdir: + if pickle_module is _weights_only_unpickler: + raise RuntimeError( + "Cannot use ``weights_only=True`` with files saved in the " + "legacy .tar format. 
" + UNSAFE_MESSAGE + ) + tar.extract("storages", path=tmpdir) + with open(os.path.join(tmpdir, "storages"), "rb", 0) as f: + num_storages = pickle_module.load(f, **pickle_load_args) + for _ in range(num_storages): + args = pickle_module.load(f, **pickle_load_args) + key, location, storage_type = args + dtype = storage_type._dtype + obj = cast(Storage, torch.UntypedStorage)._new_with_file( + f, torch._utils._element_size(dtype) + ) + obj = restore_location(obj, location) + # TODO: Once we decide to break serialization FC, we can + # stop wrapping with TypedStorage + deserialized_objects[key] = torch.storage.TypedStorage( + wrap_storage=obj, dtype=dtype, _internal=True + ) + + storage_views = pickle_module.load(f, **pickle_load_args) + for target_cdata, root_cdata, offset, numel in storage_views: + root = deserialized_objects[root_cdata] + element_size = torch._utils._element_size(root.dtype) + offset_bytes = offset * element_size + # TODO: Once we decide to break serialization FC, we can + # stop wrapping with TypedStorage + deserialized_objects[target_cdata] = torch.storage.TypedStorage( + wrap_storage=root._untyped_storage[ + offset_bytes : offset_bytes + numel * element_size + ], + dtype=root.dtype, + _internal=True, + ) + + tar.extract("tensors", path=tmpdir) + with open(os.path.join(tmpdir, "tensors"), "rb", 0) as f: + num_tensors = pickle_module.load(f, **pickle_load_args) + for _ in range(num_tensors): + args = pickle_module.load(f, **pickle_load_args) + key, storage_id, _original_tensor_type = args + storage = deserialized_objects[storage_id] + (ndim,) = struct.unpack(" str: + # When using encoding='bytes' in Py3, some **internal** keys stored as + # strings in Py2 are loaded as bytes. This function decodes them with + # ascii encoding, one that Py3 uses by default. + # + # NOTE: This should only be used on internal keys (e.g., `typename` and + # `location` in `persistent_load` below! 
+ if isinstance(bytes_str, bytes): + return bytes_str.decode("ascii") + return bytes_str + + +def _get_restore_location(map_location): + if map_location is None: + restore_location = default_restore_location + elif isinstance(map_location, dict): + + def restore_location(storage, location): + location = map_location.get(location, location) + return default_restore_location(storage, location) + + elif isinstance(map_location, (str, bytes)): + + def restore_location(storage, location): + return default_restore_location(storage, map_location) + + elif isinstance(map_location, torch.device): + + def restore_location(storage, location): + return default_restore_location(storage, str(map_location)) + + else: + + def restore_location(storage, location): + result = map_location(storage, location) + if result is None: + result = default_restore_location(storage, location) + return result + + return restore_location + + +class StorageType: + def __init__(self, name): + self._dtype = _get_dtype_from_pickle_storage_type(name) + + @property + def dtype(self): + return self._dtype + + def __str__(self): + return f"StorageType(dtype={self.dtype})" + + +def _load( + zip_file, + map_location, + pickle_module, + pickle_file="data.pkl", + overall_storage=None, + **pickle_load_args, +): + restore_location = _get_restore_location(map_location) + + loaded_storages = {} + + # check if byteswapping is needed + byteordername = "byteorder" + byteorderdata = None + if zip_file.has_record(byteordername): + byteorderdata = zip_file.get_record(byteordername) + if byteorderdata not in [b"little", b"big"]: + raise ValueError("Unknown endianness type: " + byteorderdata.decode()) + elif ( + get_default_load_endianness() == LoadEndianness.LITTLE + or get_default_load_endianness() is None + ): + byteorderdata = b"little" + elif get_default_load_endianness() == LoadEndianness.BIG: + byteorderdata = b"big" + elif get_default_load_endianness() == LoadEndianness.NATIVE: + pass + else: + raise 
ValueError("Invalid load endianness type") + + if ( + not zip_file.has_record(byteordername) + and get_default_load_endianness() is None + and sys.byteorder == "big" + ): + # Default behaviour was changed + # See https://github.com/pytorch/pytorch/issues/101688 + warnings.warn( + "The default load endianness for checkpoints without a byteorder mark " + "on big endian machines was changed from 'native' to 'little' endian, " + "to avoid this behavior please use " + "torch.serialization.set_default_load_endianness to set " + "the desired default load endianness", + UserWarning, + ) + + def load_tensor(dtype, numel, key, location): + name = f"data/{key}" + if torch._guards.detect_fake_mode(None) is not None: + nbytes = numel * torch._utils._element_size(dtype) + storage = torch.UntypedStorage(nbytes, device="meta") + elif overall_storage is not None: + storage_offset = zip_file.get_record_offset(name) + storage = overall_storage[storage_offset : storage_offset + numel] + else: + storage = ( + zip_file.get_storage_from_record(name, numel, torch.UntypedStorage) + ._typed_storage() + ._untyped_storage + ) + # swap here if byteswapping is needed + if byteorderdata is not None: + if byteorderdata.decode() != sys.byteorder: + storage.byteswap(dtype) + + # TODO: Once we decide to break serialization FC, we can + # stop wrapping with TypedStorage + typed_storage = torch.storage.TypedStorage( + wrap_storage=restore_location(storage, location), + dtype=dtype, + _internal=True, + ) + + if typed_storage._data_ptr() != 0: + loaded_storages[key] = typed_storage + + return typed_storage + + def persistent_load(saved_id): + assert isinstance(saved_id, tuple) + typename = _maybe_decode_ascii(saved_id[0]) + data = saved_id[1:] + + assert ( + typename == "storage" + ), f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'" + storage_type, key, location, numel = data + if storage_type is torch.UntypedStorage: + dtype = torch.uint8 + else: + dtype = 
storage_type.dtype + + if key in loaded_storages: + typed_storage = loaded_storages[key] + else: + nbytes = numel * torch._utils._element_size(dtype) + typed_storage = load_tensor( + dtype, nbytes, key, _maybe_decode_ascii(location) + ) + + return typed_storage + + load_module_mapping: Dict[str, str] = { + # See https://github.com/pytorch/pytorch/pull/51633 + "torch.tensor": "torch._tensor" + } + + # Need to subclass Unpickler instead of directly monkey-patching the find_class method + # because it's marked readonly in pickle. + # The type: ignore is because mypy can't statically determine the type of this class. + class UnpicklerWrapper(pickle_module.Unpickler): # type: ignore[name-defined] + # from https://stackoverflow.com/questions/13398462/unpickling-python-objects-with-a-changed-module-path/13405732 + # Lets us override the imports that pickle uses when unpickling an object. + # This is useful for maintaining BC if we change a module path that tensor instantiation relies on. + def find_class(self, mod_name, name): + if type(name) is str and "Storage" in name: + try: + return StorageType(name) + except KeyError: + pass + mod_name = load_module_mapping.get(mod_name, mod_name) + return super().find_class(mod_name, name) + + # Load the data (which may in turn use `persistent_load` to load tensors) + data_file = io.BytesIO(zip_file.get_record(pickle_file)) + + unpickler = UnpicklerWrapper(data_file, **pickle_load_args) + unpickler.persistent_load = persistent_load + # Needed for tensors where storage device and rebuild tensor device are + # not connected (wrapper subclasses and tensors rebuilt using numpy) + global _serialization_tls + _serialization_tls.map_location = map_location + result = unpickler.load() + _serialization_tls.map_location = None + + torch._utils._validate_loaded_sparse_tensors() + torch._C._log_api_usage_metadata( + "torch.load.metadata", {"serialization_id": zip_file.serialization_id()} + ) + return result + + +def 
_is_torchscript_zip(zip_file): + return "constants.pkl" in zip_file.get_all_records() From ee2f08138c8881358a1b04ec7d7b57797bb5202f Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sun, 6 Apr 2025 23:21:15 -0400 Subject: [PATCH 34/40] sglang+zero_inference --- inference/sglang/README.md | 1 + inference/sglang/local_cufile.json | 1 + inference/sglang/run_model.sh | 15 +++++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 inference/sglang/README.md create mode 100644 inference/sglang/local_cufile.json create mode 100644 inference/sglang/run_model.sh diff --git a/inference/sglang/README.md b/inference/sglang/README.md new file mode 100644 index 000000000..590d685f0 --- /dev/null +++ b/inference/sglang/README.md @@ -0,0 +1 @@ +# ZeRO-Inference SGLang examples diff --git a/inference/sglang/local_cufile.json b/inference/sglang/local_cufile.json new file mode 100644 index 000000000..29c51af0c --- /dev/null +++ b/inference/sglang/local_cufile.json @@ -0,0 +1 @@ +{"execution": {"max_io_queue_depth": 64, "max_request_parallelism": 8, "max_io_threads": 8, "parallel_io": true, "min_io_threshold_size_kb": 8192}} \ No newline at end of file diff --git a/inference/sglang/run_model.sh b/inference/sglang/run_model.sh new file mode 100644 index 000000000..3a93b09c8 --- /dev/null +++ b/inference/sglang/run_model.sh @@ -0,0 +1,15 @@ +export LOCAL_RANK=0 +DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" +MODEL_NAME="meta-llama/Meta-Llama-3.1-8B-Instruct" +BATCH_SIZE=128 + +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph + +MODEL_NAME="meta-llama/Llama-3.2-1B" +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} 
--disable-cuda-graph + + +MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph From ad81cecd0b629415ac965e196dcdb4cc328342ca Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sun, 6 Apr 2025 23:28:27 -0400 Subject: [PATCH 35/40] Remove file --- inference/sglang/local_cufile.json | 1 - 1 file changed, 1 deletion(-) delete mode 100644 inference/sglang/local_cufile.json diff --git a/inference/sglang/local_cufile.json b/inference/sglang/local_cufile.json deleted file mode 100644 index 29c51af0c..000000000 --- a/inference/sglang/local_cufile.json +++ /dev/null @@ -1 +0,0 @@ -{"execution": {"max_io_queue_depth": 64, "max_request_parallelism": 8, "max_io_threads": 8, "parallel_io": true, "min_io_threshold_size_kb": 8192}} \ No newline at end of file From dff5274e0a9371aece9163b82fd530bc71f6453f Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 8 Apr 2025 06:57:44 -0400 Subject: [PATCH 36/40] Add offload configs --- inference/sglang/ds_offload_cpu.json | 13 +++++++++++++ inference/sglang/ds_offload_nvme_aio.json | 23 +++++++++++++++++++++++ inference/sglang/ds_offload_nvme_gds.json | 23 +++++++++++++++++++++++ inference/sglang/run_model.sh | 8 +++++--- 4 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 inference/sglang/ds_offload_cpu.json create mode 100644 inference/sglang/ds_offload_nvme_aio.json create mode 100644 inference/sglang/ds_offload_nvme_gds.json diff --git a/inference/sglang/ds_offload_cpu.json b/inference/sglang/ds_offload_cpu.json new file mode 100644 index 000000000..9be11bc84 --- /dev/null +++ b/inference/sglang/ds_offload_cpu.json @@ -0,0 +1,13 @@ +{ + "zero_optimization": { + "stage": 3, + "stage3_prefetch_bucket_size": "auto", + 
"stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": "auto", + "offload_param": { + "device": "cpu", + "buffer_size": "auto" + } + }, + "train_batch_size": 1 +} diff --git a/inference/sglang/ds_offload_nvme_aio.json b/inference/sglang/ds_offload_nvme_aio.json new file mode 100644 index 000000000..268fbafc6 --- /dev/null +++ b/inference/sglang/ds_offload_nvme_aio.json @@ -0,0 +1,23 @@ +{ + "zero_optimization": { + "stage": 3, + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": "auto", + "offload_param": { + "device": "nvme", + "nvme_path": "/local_nvme/sglang", + "buffer_size": "auto", + "buffer_count": 5 + } + }, + "aio": { + "block_size": 8388608, + "queue_depth": 32, + "intra_op_parallelism": 8, + "single_submit": false, + "overlap_events": true, + "use_gds": false + }, + "train_batch_size": 1 +} diff --git a/inference/sglang/ds_offload_nvme_gds.json b/inference/sglang/ds_offload_nvme_gds.json new file mode 100644 index 000000000..479d28479 --- /dev/null +++ b/inference/sglang/ds_offload_nvme_gds.json @@ -0,0 +1,23 @@ +{ + "zero_optimization": { + "stage": 3, + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": "auto", + "offload_param": { + "device": "nvme", + "nvme_path": "/local_nvme/sglang", + "buffer_size": "auto", + "buffer_count": 3 + } + }, + "aio": { + "block_size": 8388608, + "queue_depth": 32, + "intra_op_parallelism": 8, + "single_submit": false, + "overlap_events": true, + "use_gds": true + }, + "train_batch_size": 1 +} diff --git a/inference/sglang/run_model.sh b/inference/sglang/run_model.sh index 3a93b09c8..29b3aad18 100644 --- a/inference/sglang/run_model.sh +++ b/inference/sglang/run_model.sh @@ -1,13 +1,15 @@ export LOCAL_RANK=0 DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" MODEL_NAME="meta-llama/Meta-Llama-3.1-8B-Instruct" 
-BATCH_SIZE=128 +BATCH_SIZE=1 # python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph MODEL_NAME="meta-llama/Llama-3.2-1B" -python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference -# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" From d84bb56869b0997a32b2138ef0f94ba0539a09c7 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 8 Apr 2025 07:10:43 -0400 Subject: [PATCH 37/40] Add pin_memory --- inference/sglang/ds_offload_cpu.json | 1 + inference/sglang/ds_offload_nvme_aio.json | 1 + inference/sglang/ds_offload_nvme_gds.json | 1 + 3 files changed, 3 insertions(+) diff --git a/inference/sglang/ds_offload_cpu.json b/inference/sglang/ds_offload_cpu.json index 9be11bc84..1c0438014 100644 --- a/inference/sglang/ds_offload_cpu.json +++ b/inference/sglang/ds_offload_cpu.json @@ -6,6 +6,7 @@ "stage3_max_live_parameters": "auto", "offload_param": { "device": "cpu", + "pin_memory": true, "buffer_size": "auto" } }, diff --git a/inference/sglang/ds_offload_nvme_aio.json 
b/inference/sglang/ds_offload_nvme_aio.json index 268fbafc6..71ea89438 100644 --- a/inference/sglang/ds_offload_nvme_aio.json +++ b/inference/sglang/ds_offload_nvme_aio.json @@ -7,6 +7,7 @@ "offload_param": { "device": "nvme", "nvme_path": "/local_nvme/sglang", + "pin_memory": true, "buffer_size": "auto", "buffer_count": 5 } diff --git a/inference/sglang/ds_offload_nvme_gds.json b/inference/sglang/ds_offload_nvme_gds.json index 479d28479..7f3784741 100644 --- a/inference/sglang/ds_offload_nvme_gds.json +++ b/inference/sglang/ds_offload_nvme_gds.json @@ -7,6 +7,7 @@ "offload_param": { "device": "nvme", "nvme_path": "/local_nvme/sglang", + "pin_memory": true, "buffer_size": "auto", "buffer_count": 3 } From db3b32b4df9b85d970b12c8a96f9f89d248d1c25 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 8 Apr 2025 07:22:42 -0400 Subject: [PATCH 38/40] Cleanup scripts --- inference/sglang/run_llama3_1B.sh | 11 +++++++++++ inference/sglang/run_llama3_70B.sh | 9 +++++++++ inference/sglang/{run_model.sh => run_llama3_8B.sh} | 10 +--------- 3 files changed, 21 insertions(+), 9 deletions(-) create mode 100644 inference/sglang/run_llama3_1B.sh create mode 100644 inference/sglang/run_llama3_70B.sh rename inference/sglang/{run_model.sh => run_llama3_8B.sh} (62%) diff --git a/inference/sglang/run_llama3_1B.sh b/inference/sglang/run_llama3_1B.sh new file mode 100644 index 000000000..a6a1f543f --- /dev/null +++ b/inference/sglang/run_llama3_1B.sh @@ -0,0 +1,11 @@ +export LOCAL_RANK=0 +DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" +BATCH_SIZE=128 +MODEL_NAME="meta-llama/Llama-3.2-1B" + +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config 
ds_offload_nvme_aio.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph + + diff --git a/inference/sglang/run_llama3_70B.sh b/inference/sglang/run_llama3_70B.sh new file mode 100644 index 000000000..6e3949551 --- /dev/null +++ b/inference/sglang/run_llama3_70B.sh @@ -0,0 +1,9 @@ +export LOCAL_RANK=0 +DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" +BATCH_SIZE=128 +MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" + +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json +python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json +# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph diff --git a/inference/sglang/run_model.sh b/inference/sglang/run_llama3_8B.sh similarity index 62% rename from inference/sglang/run_model.sh rename to inference/sglang/run_llama3_8B.sh index 29b3aad18..f203bdd4a 100644 --- a/inference/sglang/run_model.sh +++ b/inference/sglang/run_llama3_8B.sh @@ -1,17 +1,9 @@ export LOCAL_RANK=0 DATASET_OPTS="--dataset-name random --random-input-len 512 --random-output-len 32 --random-range-ratio 1.0" +BATCH_SIZE=128 MODEL_NAME="meta-llama/Meta-Llama-3.1-8B-Instruct" -BATCH_SIZE=1 -# python -m sglang.bench_offline_throughput 
--model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph - -MODEL_NAME="meta-llama/Llama-3.2-1B" python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_cpu.json python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_aio.json python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference-config ds_offload_nvme_gds.json python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph - - -MODEL_NAME="meta-llama/Meta-Llama-3.1-70B" -# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph --zero-inference -# python -m sglang.bench_offline_throughput --model-path ${MODEL_NAME} ${DATASET_OPTS} --num-prompts ${BATCH_SIZE} --disable-cuda-graph From 6ee91cb5297497ee1abdd281f58ba2036ad52faf Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 12 Apr 2025 14:17:38 -0400 Subject: [PATCH 39/40] SGLang README --- inference/sglang/README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/inference/sglang/README.md b/inference/sglang/README.md index 590d685f0..f904864dd 100644 --- a/inference/sglang/README.md +++ b/inference/sglang/README.md @@ -1 +1,12 @@ -# ZeRO-Inference SGLang examples +# SGLang + ZeRO-Inference Examples +This folder contains examples of [ZeRO-Inference](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) integration into [SGLang](https://github.com/sgl-project/sglang) framework. 
This integration enables SGLang to run inference on massive models (e.g., with hundreds of billions of parameters) on a single GPU through the NVMe/CPU offloading optimizations of ZeRO-Inference. + +## Prerequisites +1. DeepSpeed version >= [0.16.6](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.16.6) +2. SGLang: These examples require our SGLang [fork](https://github.com/tjruwase/sglang/tree/zero-inference). We plan to upstream the SGLang changes to the main branch. + + +## Examples +The examples comprise the following: +1. bash scripts that benchmark SGLang throughput in [offline mode](https://github.com/sgl-project/sglang/blob/main/python/sglang/bench_offline_throughput.py) with different ZeRO-Inference offloading options. Each script runs inference on a different model with a prompt of 512 tokens, output of 32 tokens, and batch size of 128. +2. DeepSpeed config files corresponding to ZeRO-Inference offloading: (i) CPU offload, (ii) NVMe offload with AIO, and (iii) NVMe offload with NVIDIA GDS. \ No newline at end of file From e283b7429f407f0d74ee50aed2ac122a82cf93db Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 12 Apr 2025 14:20:06 -0400 Subject: [PATCH 40/40] Remove file --- deepnvme/model_checkpoint/local_cufile.json | 1 - 1 file changed, 1 deletion(-) delete mode 100644 deepnvme/model_checkpoint/local_cufile.json diff --git a/deepnvme/model_checkpoint/local_cufile.json b/deepnvme/model_checkpoint/local_cufile.json deleted file mode 100644 index 7d4d9c8e3..000000000 --- a/deepnvme/model_checkpoint/local_cufile.json +++ /dev/null @@ -1 +0,0 @@ -{"execution": {"max_io_queue_depth": 8, "max_request_parallelism": 1, "max_io_threads": 1, "parallel_io": true, "min_io_threshold_size_kb": 8192}} \ No newline at end of file