33 changes: 30 additions & 3 deletions docs/user-guide/features/tokenizers.md
@@ -146,7 +146,24 @@ tokenizer = MegatronTokenizer.from_pretrained(

### Null Tokenizer

Use a null tokenizer for testing or non-text models:
The Null tokenizer is a lightweight, zero-I/O tokenizer that requires no model files.
It is useful in three scenarios:

1. **Performance benchmarking** with `--mock-data` where real tokenization is unnecessary.
2. **Testing** in functional tests and CI pipelines where tokenizer model files may not
be available. The Null tokenizer removes the dependency on external files, making
tests self-contained and portable.
3. **Pretraining with pretokenized data** where all data is already tokenized into
`.bin`/`.idx` files. In this case the tokenizer is only needed for metadata
(`vocab_size`, `eod`, `pad`) — not for actual tokenization. Using the Null tokenizer
avoids redundant filesystem access at scale, which is particularly beneficial on
shared filesystems like Lustre where thousands of ranks would otherwise all load the
same tokenizer files.

Properties derived from `--vocab-size N`:
- `vocab_size` = `N` (the exact value passed)
- `eod` = `N - 1` (last token in the vocabulary)
- `pad` = `0`
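These derivations can be checked with a short standalone sketch. This is illustrative only: `N` stands in for the `--vocab-size` value, and the code below is not the Megatron-LM API.

```python
# Illustration of the Null tokenizer's metadata invariants.
# N mirrors the value passed via --vocab-size; this is a sketch,
# not the actual Megatron-LM implementation.
N = 131072

vocab_size = N   # the exact value passed
eod = N - 1      # last token in the vocabulary
pad = 0          # fixed pad id

assert eod < vocab_size
assert pad < vocab_size
print(vocab_size, eod, pad)  # 131072 131071 0
```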

```python
tokenizer = MegatronTokenizer.from_pretrained(
@@ -162,10 +179,20 @@ tokenizer = MegatronTokenizer.from_pretrained(
The tokenizer system integrates seamlessly with Megatron-LM training:

```bash
# Null tokenizer for testing
# Null tokenizer for benchmarking with mock data
torchrun --nproc_per_node=8 pretrain_gpt.py \
--tokenizer-type NullTokenizer \
--vocab-size 131072 \
--mock-data \
...
```

```bash
# Null tokenizer for pretraining with pretokenized data (no tokenizer files needed)
torchrun --nproc_per_node=8 pretrain_gpt.py \
--tokenizer-type NullTokenizer \
--vocab-size 128256 \
--data-path /path/to/pretokenized_data \
...
```
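Even though no real tokenization happens, the Null tokenizer still accepts whitespace-separated integer strings and parses them back into token ids, as the unit test in this PR exercises with `"11 325 97"`. A minimal sketch of that behavior (an illustration, not the Megatron implementation):

```python
def null_text_to_ids(text: str) -> list[int]:
    # The Null tokenizer treats its input as whitespace-separated
    # token ids; no vocabulary lookup or model file is involved.
    return [int(tok) for tok in text.split()]

ids = null_text_to_ids("11 325 97")
print(ids)  # [11, 325, 97]
```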

@@ -190,7 +217,7 @@ If `--tokenizer-metadata` is not specified, a default metadata file is generated
| **SentencePiece** | Google's tokenizer | GPT-style models, custom vocabularies |
| **TikToken** | OpenAI's tokenizer | GPT-3.5/GPT-4 style tokenization |
| **Megatron** | Built-in tokenizers | Legacy GPT-2 BPE |
| **Null** | No-op tokenizer | Testing, non-text modalities |
| **Null** | Zero-I/O tokenizer | Benchmarking, pretokenized data |

## Common Tokenizer Types

20 changes: 15 additions & 5 deletions megatron/core/tokenizers/text/libraries/null_tokenizer.py
@@ -11,10 +11,10 @@ class NullTokenizer:
vocab_size: vocabulary size for embedding
"""

def __init__(self, vocab_size):
def __init__(self, vocab_size, **kwargs):
""" """
self._vocab_size_without_eod = int(vocab_size)
self._eod_id = self._vocab_size_without_eod
self._vocab_size = int(vocab_size)
self._eod_id = self._vocab_size - 1

def text_to_ids(self, text):
"""Converts text to ids."""
@@ -44,12 +44,17 @@ def offsets(self, ids: list[int], text: str) -> list[int]:
@property
def unique_identifiers(self) -> OrderedDict:
"""Property required for use with megatron-core datasets."""
return OrderedDict({"class": f"{type(self).__module__}.{type(self).__qualname__}"})
return OrderedDict(
{
"class": f"{type(self).__module__}.{type(self).__qualname__}",
"vocab_size": self._vocab_size,
}
)

@property
def vocab_size(self):
"""Returns vocab size."""
return self._vocab_size_without_eod + 1
return self._vocab_size

@property
def vocab(self):
@@ -81,6 +86,11 @@ def eod(self):
"""Returns eod token."""
return self._eod_id

@property
def pad_id(self):
"""Returns pad token."""
return 0

@property
def additional_special_tokens_ids(self):
""" """
@@ -579,7 +579,7 @@ def get_model_argv(self):
"--tokenizer-type",
"NullTokenizer",
"--vocab-size",
"127", # ... NullTokenizer adds +1 EOD token.
"128",
"--make-vocab-size-divisible-by",
"1",
]
@@ -28,7 +28,7 @@ MODEL_ARGS:
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--tokenizer-type: NullTokenizer
--vocab-size: 131072
--vocab-size: 131073
--mock-data: true
--split: 949,50,1
--distributed-backend: nccl
@@ -28,7 +28,7 @@ MODEL_ARGS:
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--tokenizer-type: NullTokenizer
--vocab-size: 131072
--vocab-size: 131073
--mock-data: true
--split: 949,50,1
--distributed-backend: nccl
@@ -20,8 +20,8 @@ MODEL_ARGS:
--save: ${CHECKPOINT_SAVE_PATH}
--load: ${CHECKPOINT_LOAD_PATH}
--data-path: ${DATA_PATH}/text/common_pile/v01_filtered_data/my-gpt3_00_text_document
--vocab-file: ${DATA_PATH}/text/common_pile/v01_filtered_data/bpe/vocab.json
--merge-file: ${DATA_PATH}/text/common_pile/v01_filtered_data/bpe/merges.txt
--tokenizer-type: NullTokenizer
--vocab-size: 50257
--split: 949,50,1
--distributed-backend: nccl
--lr: 0.00015
@@ -25,7 +25,7 @@ MODEL_ARGS:
--load: ${CHECKPOINT_LOAD_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--vocab-size: 8193
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
@@ -27,7 +27,7 @@ MODEL_ARGS:
--load: ${CHECKPOINT_LOAD_PATH}
--split: 949,50,1
--tokenizer-type: NullTokenizer
--vocab-size: 8192
--vocab-size: 8193
--distributed-backend: nccl
--lr: 0.00015
--lr-decay-style: cosine
4 changes: 3 additions & 1 deletion tests/unit_tests/tokenizers/test_tokenizer.py
@@ -275,7 +275,9 @@ def test_null_tokenizer():
ids = tokenizer.tokenize("11 325 97")

assert ids == [11, 325, 97]
assert tokenizer.vocab_size == 131073
assert tokenizer.vocab_size == 131072
assert tokenizer.eod == 131071
assert tokenizer.pad == 0


@pytest.mark.parametrize("skip_special_tokens", [True, False])