From c36125567e2923a3dd4ca723687a47f4a3ec5202 Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Thu, 14 May 2026 17:23:32 +0200 Subject: [PATCH 01/18] tokenizer: added clamp init option for optimal tokenizer size --- src/gpt_lab/model/auto.py | 20 ++-- src/gpt_lab/tokenizer/tokenizer.py | 143 +++++++++++++++++++---------- 2 files changed, 107 insertions(+), 56 deletions(-) diff --git a/src/gpt_lab/model/auto.py b/src/gpt_lab/model/auto.py index c113171..ec7640f 100644 --- a/src/gpt_lab/model/auto.py +++ b/src/gpt_lab/model/auto.py @@ -32,7 +32,7 @@ class AutoGPTConfig(BaseModel): # Tokenizer config # If None, will be set to vocab size scaling law based on Tao et al. 2O24 (https://arxiv.org/abs/2407.13623) - tokenizer_model: Optional[str] = None # none, auto, or name + tokenizer_model: Optional[str] = None # none, auto, or clamp train_tokenizer: bool = False vocab_size: int = -1 pat_str: Optional[str] = None @@ -85,11 +85,16 @@ def generate_gpt_config(self, device) -> MetaConfig: def _get_tokenizer_pretrained(tname: str, source: str = "tiktoken") -> Tokenizer: # TODO: need to be simplified and optimized # if a specific tokenizer model is specified, we will use it and ignore the scaling law - try: + try: + return Tokenizer.from_pretrained(tname, source=source) + except Exception as e1: + log0(f"Error occurred while loading tokenizer model {tname} from {source}. Error: {e1}", logger=logger, level="warning") + try: _tconfig = TokenizerConfig(name=tname, source=source, vocab_size=-1, special_tokens=special_tokens, pat_str="") - tokenizer = Tokenizer.from_config(_tconfig) - except Exception as e: - error(f"Error occurred while loading tokenizer model {self.tokenizer_model} from {source}. Error: {e}", logger=logger) + return Tokenizer.from_config(_tconfig) + except Exception as e2: + log0(f"Error occurred while loading tokenizer model {tname} from {source}. 
Error: {e2}", logger=logger, level="warning") + try: _tconfig = TokenizerConfig.from_directory(name=tname) _mergeable_ranks = _tconfig.get_mergeable_ranks() tokenizer = Tokenizer( @@ -97,9 +102,8 @@ def _get_tokenizer_pretrained(tname: str, source: str = "tiktoken") -> Tokenizer special_tokens=special_tokens, config=_tconfig ) - except Exception as e: - raise ValueError(f"Could not load tokenizer model {self.tokenizer_model} from either tiktoken or local cache. Error: {e}") - return tokenizer + except Exception as e3: + log_error(f"Error occurred while loading tokenizer model {tname} from local cache. Error: {e3}", logger=logger, error_type=ValueError) def build_meta_model_from_depth(depth: int, vocab_size: int = -1) -> DenseTransformer: # Initiate instance of optimized GPTConfig to access default values and methods diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index 3cd3718..94bb2a8 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -1,17 +1,20 @@ -from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig -import tiktoken -from tokenizers import Tokenizer as HFTokenizer +from __future__ import annotations + import torch -from typing import Callable, Iterable, List, Optional, Union, Tuple +import random, json, os, csv + import pickle from pathlib import Path -import random, json, os, csv + +from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig from gpt_lab.utils.default import TOKENIZERS_FOLDER from gpt_lab.utils.special_tokens import SpecialTokens +from gpt_lab.utils.logging import log0, log_error + +import tiktoken from tokenizers import Tokenizer as HFTokenizer -from gpt_lab.utils.common import print0, print0_dict -from gpt_lab.utils.logging import log0, log_error +from typing import Callable, Iterable, List, Optional, Union, Tuple, Dict import logging logger = logging.getLogger(__name__) @@ -19,11 +22,9 @@ # 
------------------------------------------------------------ # FACTORY FUNCTION TO BUILD TOKENIZER FROM CONFIG # ------------------------------------------------------------ - -def get_closest_tokenizer_size(vocab_size: int) -> Tuple[str, int]: - """Get the closest tokenizer size from the cache based on the provided vocab size.""" - # TODO: i removed tokenizer cache manager in a previous commit - # have to make it back in order to make this works with new tokenizers + +def _get_tokenizer_sizes_in_cache() -> Dict[str, int]: + """Get the vocab sizes of all tokenizers in the cache based on the provided tokenizer name.""" tiktoken_encs = { name: len(tiktoken.get_encoding(name)._mergeable_ranks) for name in ("gpt2", "cl100k_base", "o200k_base") @@ -41,6 +42,25 @@ def get_closest_tokenizer_size(vocab_size: int) -> Tuple[str, int]: df_tok_cache[name] = int(vocab) tokenizer_sizes = {**tiktoken_encs, **df_tok_cache} + return tokenizer_sizes + +def get_higher_closest_tokenizer_size(vocab_size: int) -> Tuple[str, int]: + """Get the next higher tokenizer size from the cache based on the provided vocab size.""" + tokenizer_sizes = _get_tokenizer_sizes_in_cache() + higher_tokenizers = { name: size for name, size in tokenizer_sizes.items() if size >= vocab_size } + if not higher_tokenizers: + raise ValueError(f"No tokenizer found with vocab size ≥ {vocab_size}.") + tok_name, closest_size = min( + higher_tokenizers.items(), + key=lambda x: x[1] + ) + return tok_name, closest_size + +def get_closest_tokenizer_size(vocab_size: int) -> Tuple[str, int]: + """Get the closest tokenizer size from the cache based on the provided vocab size.""" + # TODO: i removed tokenizer cache manager in a previous commit + # have to make it back in order to make this works with new tokenizers + tokenizer_sizes = _get_tokenizer_sizes_in_cache() tok_name, closest_size = min( tokenizer_sizes.items(), key=lambda x: abs(x[1] - vocab_size) @@ -148,13 +168,11 @@ def encode(self, text, *args, **kwargs): def 
decode(self, tokens, *args, **kwargs): return "".join([chr(t) for t in tokens]) - # ------------------------------------------------------------ # HUGGINGFACE TOKENIZER WRAPPER (for some utilities) # ------------------------------------------------------------ - class HuggingFaceTokenizerWrapper(_BaseTokenizer): """Light wrapper around HuggingFace Tokenizer for some utilities""" @@ -356,6 +374,7 @@ def from_config(cls, config: TokenizerConfig): config=config ) + def get_bos_token_id(self): return self.bos_token_id @@ -453,41 +472,11 @@ def merge_to_bytes(merge): tokenizer = cls( mergeable_ranks=mergeable_ranks, special_tokens=special_tokens, - # special_tokens=config.special_tokens.list(), config=config ) - if config.to_save: tokenizer.save_to_directory() return tokenizer - - # def get_token_bytes(self): - # token_bytes_path = Path(self.config.dirname) / "token_bytes.pt" - # if getattr(self, 'token_bytes', None) is not None: - # return self.token_bytes - - # if token_bytes_path.exists(): - # token_bytes = torch.load(token_bytes_path) - # print(f"Loaded token_bytes from {token_bytes_path}.") - # else: - # vocab_size = self.vocab_size - # special_set = set(self.special_tokens) - # token_strings = [self.decode([token_id]) for token_id in range(vocab_size)] - # token_bytes = [] - # for token_id in range(vocab_size): - # token_str = token_strings[token_id] # the Python string representation of this token - # if token_str in special_set: - # token_bytes.append(0) # special characters are not counted - # else: - # id_bytes = len(token_str.encode("utf-8")) # number of bytes that make up this token - # token_bytes.append(id_bytes) - # token_bytes = torch.tensor(token_bytes, dtype=torch.int32, device='cpu') - # with open(token_bytes_path, "wb") as f: - # torch.save(token_bytes, f) - # print(f"Saved token_bytes to {token_bytes_path}") - # # self.token_bytes_cache = token_bytes - # # self.token_bytes = token_bytes - # return token_bytes @classmethod def from_disk(cls, name: 
str, cachedir: Optional[Union[str, Path]] = None): @@ -498,16 +487,74 @@ def from_disk(cls, name: str, cachedir: Optional[Union[str, Path]] = None): # dirname = cachedir / name config = TokenizerConfig.from_directory(name, cachedir=cachedir) mergeable_ranks = config.get_mergeable_ranks() - log0(f"Loaded tokenizer config from {name} with vocab size {len(mergeable_ranks) + len(config.special_tokens)}", logger=logger) - # vocab_path = dirname / "vocab.pkl" - # with open(vocab_path, "rb") as vf: - # mergeable_ranks = pickle.load(vf) + log0(f"Loaded tokenizer config from {name} with vocab size " + f"{len(mergeable_ranks) + len(config.special_tokens)}", + logger=logger) return cls( mergeable_ranks=mergeable_ranks, special_tokens=config.special_tokens.list(), config=config ) + + + @classmethod + def clamped_from_pretrained(cls, name: str, new_vocab_size: int, source: str = "tiktoken") -> Tokenizer: + """Return a new Tokenizer with vocab clamped to new_vocab_size.""" + new_name = f"{tokenizer.config.name}_clamped_{new_vocab_size}" + try: + tokenizer = cls.from_disk(new_name) + log0(f"Found existing clamped tokenizer {new_name} on disk, loading it instead of creating a new one.", logger=logger) + return tokenizer + except Exception as e: + log0(f"No existing clamped tokenizer found on disk for {new_name}. Creating a new one by clamping the pretrained tokenizer {name}. 
Error: {e}", logger=logger, level="warning") + tokenizer = cls.from_pretrained(name, source=source) + if new_vocab_size >= tokenizer.vocab_size: + return tokenizer + + merges_to_remove = tokenizer.vocab_size - new_vocab_size + token_to_id = dict(tokenizer.token_to_id) # shallow copy, don't mutate original + for merge, _ in sorted(token_to_id.items(), key=lambda x: x[1], reverse=True): + if merges_to_remove <= 0: + break + del token_to_id[merge] + merges_to_remove -= 1 + config = TokenizerConfig( + name=f"{tokenizer.config.name}_clamped_{new_vocab_size}", + source=tokenizer.config.source, + dirname=tokenizer.config.dirname.parent / new_name, + vocab_size=new_vocab_size, + pat_str=tokenizer.config.pat_str, + special_tokens=tokenizer.config.special_tokens, + source=tokenizer.config.source + ) + tokenizer = cls( + mergeable_ranks=token_to_id, + special_tokens=list(config.special_tokens.values()), + config=config, + ) + tokenizer.update_token_bytes() + return tokenizer + @classmethod + def get_closest_clamped_from_pretrained(cls, tokenizer: Tokenizer, target_vocab_size: int) -> Tokenizer: + # Find the closest vocab size that is less than or equal to the target + closest_vocab_size = min( + vocab_size for vocab_size in tokenizer.config.vocab_sizes + if vocab_size <= target_vocab_size + ) + return cls.clamped_from_pretrained(tokenizer.config.name, closest_vocab_size) + + def update_token_bytes(self): + if len(self.token_to_id) == self.vocab_size - len(self.special_tokens): + log0("No merges were removed, token bytes remain the same.", logger=logger) + return + self.token_bytes = self.get_token_bytes() + self.token_bytes = torch.cat([ + self.token_bytes[:len(self.token_to_id)], # surviving merges, byte counts intact + torch.zeros(len(self.special_tokens), dtype=torch.int32) # special tokens always 0 + ]) + # Recompute token bytes and save to disk, useful after clamping the vocab + def save_to_directory(self, directory: Optional[Union[str, Path]] = None): # Save the 
tokenizer's merges and vocab to the specified directory if directory is None: From 904cdf692c307f7d8031ff84f6e605dc3c180e96 Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Thu, 14 May 2026 17:26:29 +0200 Subject: [PATCH 02/18] tokenizer: fix test --- src/gpt_lab/tokenizer/tokenizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index 94bb2a8..fb629f0 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -524,8 +524,7 @@ def clamped_from_pretrained(cls, name: str, new_vocab_size: int, source: str = " dirname=tokenizer.config.dirname.parent / new_name, vocab_size=new_vocab_size, pat_str=tokenizer.config.pat_str, - special_tokens=tokenizer.config.special_tokens, - source=tokenizer.config.source + special_tokens=tokenizer.config.special_tokens ) tokenizer = cls( mergeable_ranks=token_to_id, From a3e536b6648d224d281c4ecf061810c2be04a042 Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Sat, 16 May 2026 00:00:39 +0200 Subject: [PATCH 03/18] tokenizer encoder note --- src/gpt_lab/tokenizer/tokenizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index fb629f0..20e1837 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -582,6 +582,8 @@ def encode( text: Union[str, List[str]], *args, **kwargs ) -> Union[List[int], List[List[int]]]: + # NOTE: maybe it would be better to unfused both str and list encoding into separate methods to avoid confusion + # and potential bugs with the different options (e.g. prepend_bos, unsqueeze) that may not be compatible with both modes? 
prepend_bos = kwargs.pop("prepend_bos", False) unsqueeze = kwargs.pop("unsqueeze", False) if isinstance(text, str): From b27cb51e54b899ff36b042fc4b59464145faad0c Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Sat, 16 May 2026 12:26:39 +0200 Subject: [PATCH 04/18] tokenizer: clamp to truncate --- scripts/train_base.py | 2 +- src/gpt_lab/tokenizer/tokenizer.py | 89 ++++++++++++++++++++---------- 2 files changed, 62 insertions(+), 29 deletions(-) diff --git a/scripts/train_base.py b/scripts/train_base.py index 22fafbe..71f83a0 100644 --- a/scripts/train_base.py +++ b/scripts/train_base.py @@ -112,7 +112,7 @@ def get_common_arguments(prs: ArgumentParser): auto_parser.add_argument("--vocab-size", type=int, default=-1, help="(default: -1) Vocabulary size for auto-configured models. If not set, will be determined by vocab size scaling law based on model depth.") auto_parser.add_argument("--pat-str", type=str, default=None, help="(default: None) Split pattern for pre-tokenization if training a new-tokenizer. Options are 'gpt2, 'gpt4', 'cl100k_base', 'o200k_base', or directly the pattern string. 
If not set, will default to 'gpt2' pattern.") auto_parser.add_argument("--train-tokenizer", action="store_true", help="(default: False) Whether to train a new tokenizer from scratch.") - auto_parser.add_argument("--clamp-tokenizer", action="store_true", help="(default: False) Whether to clamp tokenizer values.") + auto_parser.add_argument("--truncate-tokenizer", action="store_true", help="(default: False) Whether to truncate tokenizer values.") ## Model arguments auto_parser.add_argument("--depth", type=int, default=12, help="(default: 12) Number of model layers.") diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index 20e1837..4dcfbc5 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -86,7 +86,7 @@ class _BaseTokenizer: def __init__(self, config: Optional[TokenizerConfig] = None): self.config = config self.special_tokens = None - self.token_to_id = None + self.mergeable_ranks = None try: self.token_bytes = self.get_token_bytes() except Exception as e: @@ -96,11 +96,20 @@ def __init__(self, config: Optional[TokenizerConfig] = None): level="warning", logger=logger) def get_vocab(self): - return {**self.token_to_id, **self.special_tokens} + return {**self.mergeable_ranks, **self.special_tokens} @property def vocab_size(self): - return len(self.token_to_id) + len(self.special_tokens) + "vocab_size value icludes both mergeable ranks and special tokens" + return len(self.mergeable_ranks) + len(self.special_tokens) + + @property + def n_special_tokens(self): + return len(self.special_tokens) + + @property + def n_ranks(self): + return len(self.mergeable_ranks) def get_token_bytes(self): token_bytes_path = Path(self.config.dirname) / "token_bytes.pt" @@ -148,13 +157,13 @@ def __init__(self, config: Optional[TokenizerConfig] = None): ) super().__init__(config) n_merges = config.vocab_size - len(config.special_tokens) - self.token_to_id = { bytes([i]): i for i in range(min(256, n_merges)) } + 
self.mergeable_ranks = { bytes([i]): i for i in range(min(256, n_merges)) } if n_merges > 256: for i in range(256, n_merges): # make merges deterministic at least lol merge1 = bytes([i // 256]) merge2 = bytes([i % 256]) - self.token_to_id[merge1+merge2] = i + self.mergeable_ranks[merge1+merge2] = i self.special_tokens = config.special_tokens self.bos_token_id = config.vocab_size @@ -189,7 +198,7 @@ def special_tokens(self): @classmethod def from_pretrained(cls, hf_path): # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") - tokenizer = HFTokenizer.from_config(hf_path) + tokenizer = HFTokenizer.from_pretrained(hf_path) config = TokenizerConfig( name=hf_path, source="huggingface", @@ -233,7 +242,7 @@ def _encode_one(self, text, prepend=None, append=None, num_threads=None): def encode_special(self, text): # encode a single special token via exact match - return self.main.token_to_id(text) + return self.main.mergeable_ranks(text) def get_bos_token_id(self): # Different HuggingFace models use different BOS tokens and there is little consistency @@ -268,7 +277,6 @@ def save(self, tokenizer_dir): self.main.save(tokenizer_path) print(f"Saved tokenizer to {tokenizer_path}") - # ------------------------------------------------------------ # MAIN TOKENIZER CLASS # - train with huggingface, @@ -299,7 +307,7 @@ def __init__( ): super().__init__(config=config) special_tokens = { sp: rank + len(mergeable_ranks) for rank, sp in enumerate(special_tokens) } - self.token_to_id = mergeable_ranks + self.mergeable_ranks = mergeable_ranks self.main = tiktoken.Encoding( name=config.name, pat_str=config.pat_str, @@ -498,28 +506,48 @@ def from_disk(cls, name: str, cachedir: Optional[Union[str, Path]] = None): @classmethod - def clamped_from_pretrained(cls, name: str, new_vocab_size: int, source: str = "tiktoken") -> Tokenizer: - """Return a new Tokenizer with vocab clamped to new_vocab_size.""" - new_name = f"{tokenizer.config.name}_clamped_{new_vocab_size}" + def 
truncated_from_pretrained(cls, name: str, new_vocab_size: int, source: str = "tiktoken") -> Tokenizer: + """Construct a tokenizer by truncating the merge-rank table + of a pretrained tokenizer. + + The tokenizer retains: + - all primitive byte tokens + - the earliest merge rules + - all special tokens + + and removes later BPE merges according to merge rank order. + + Because BPE merge ranks are constructed incrementally, + keeping the first K mergeable ranks preserves a valid + prefix of the original tokenizer. + """ + new_name = f"{name}_truncated_{new_vocab_size}" + try: tokenizer = cls.from_disk(new_name) - log0(f"Found existing clamped tokenizer {new_name} on disk, loading it instead of creating a new one.", logger=logger) + log0(f"Found existing truncated tokenizer {new_name} on disk, loading it instead of creating a new one.", logger=logger) return tokenizer except Exception as e: - log0(f"No existing clamped tokenizer found on disk for {new_name}. Creating a new one by clamping the pretrained tokenizer {name}. Error: {e}", logger=logger, level="warning") + log0(f"No existing truncated tokenizer found on disk for {new_name}. Creating a new one by truncating the pretrained tokenizer {name}. Error: {e}", logger=logger, level="warning") + tokenizer = cls.from_pretrained(name, source=source) + if new_vocab_size - len(tokenizer.special_tokens) < 256: + msg = f"New vocab size {new_vocab_size} is too small to fit all single byte tokens and special tokens. Minimum vocab size is {256 + len(tokenizer.special_tokens)}." 
+ log_error(msg, error_type=ValueError, logger=logger) + if new_vocab_size >= tokenizer.vocab_size: return tokenizer merges_to_remove = tokenizer.vocab_size - new_vocab_size - token_to_id = dict(tokenizer.token_to_id) # shallow copy, don't mutate original - for merge, _ in sorted(token_to_id.items(), key=lambda x: x[1], reverse=True): + mergeable_ranks = dict(tokenizer.mergeable_ranks) # shallow copy, don't mutate original + for merge, _ in sorted(mergeable_ranks.items(), key=lambda x: x[1], reverse=True): if merges_to_remove <= 0: break - del token_to_id[merge] + del mergeable_ranks[merge] merges_to_remove -= 1 + config = TokenizerConfig( - name=f"{tokenizer.config.name}_clamped_{new_vocab_size}", + name=f"{tokenizer.config.name}_truncated_{new_vocab_size}", source=tokenizer.config.source, dirname=tokenizer.config.dirname.parent / new_name, vocab_size=new_vocab_size, @@ -527,33 +555,38 @@ def clamped_from_pretrained(cls, name: str, new_vocab_size: int, source: str = " special_tokens=tokenizer.config.special_tokens ) tokenizer = cls( - mergeable_ranks=token_to_id, - special_tokens=list(config.special_tokens.values()), + mergeable_ranks=mergeable_ranks, + special_tokens=config.special_tokens.list(), config=config, ) tokenizer.update_token_bytes() return tokenizer @classmethod - def get_closest_clamped_from_pretrained(cls, tokenizer: Tokenizer, target_vocab_size: int) -> Tokenizer: + def get_closest_truncated_from_pretrained(cls, tokenizer: Tokenizer, target_vocab_size: int) -> Tokenizer: # Find the closest vocab size that is less than or equal to the target closest_vocab_size = min( vocab_size for vocab_size in tokenizer.config.vocab_sizes if vocab_size <= target_vocab_size ) - return cls.clamped_from_pretrained(tokenizer.config.name, closest_vocab_size) + return cls.truncated_from_pretrained(tokenizer.config.name, closest_vocab_size) def update_token_bytes(self): - if len(self.token_to_id) == self.vocab_size - len(self.special_tokens): + if not hasattr(self, 
'token_bytes'): + self.token_bytes = self.get_token_bytes() + if len(self.token_bytes) == len(self.mergeable_ranks) + len(self.special_tokens): log0("No merges were removed, token bytes remain the same.", logger=logger) return + old_vocab_size = len(self.token_bytes) self.token_bytes = self.get_token_bytes() self.token_bytes = torch.cat([ - self.token_bytes[:len(self.token_to_id)], # surviving merges, byte counts intact - torch.zeros(len(self.special_tokens), dtype=torch.int32) # special tokens always 0 + self.token_bytes[:len(self.mergeable_ranks)], # surviving merges, byte counts intact + torch.zeros(len(self.special_tokens), dtype=torch.int32) # special tokens always 0 ]) - # Recompute token bytes and save to disk, useful after clamping the vocab - + log0(f"Updated token bytes after truncation from {old_vocab_size:,} to {self.vocab_size:,}", logger=logger) + # Recompute token bytes and save to disk, useful after truncating the vocab + self.config.save_to_directory() + def save_to_directory(self, directory: Optional[Union[str, Path]] = None): # Save the tokenizer's merges and vocab to the specified directory if directory is None: @@ -571,7 +604,7 @@ def save_to_directory(self, directory: Optional[Union[str, Path]] = None): vocab_path = directory / "vocab.pkl" with open(vocab_path, "wb") as vf: - pickle.dump(self.token_to_id, vf) + pickle.dump(self.mergeable_ranks, vf) log0(f"Saved tokenizer vocab to {vocab_path}", logger=logger) def encode_special(self, token: str) -> int: From e605beb6bac2cd749e76b30ae83689c311363b90 Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Sun, 17 May 2026 23:57:43 +0200 Subject: [PATCH 05/18] list special tokens --- src/gpt_lab/tokenizer/tokenizer.py | 6 ++---- src/gpt_lab/utils/special_tokens.py | 3 +++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index 253a841..2268ddc 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ 
b/src/gpt_lab/tokenizer/tokenizer.py @@ -285,7 +285,7 @@ def save(self, tokenizer_dir): # ------------------------------------------------------------ class Tokenizer(_BaseTokenizer): - """ Wrapper class for different tokenizer implementations + """Wrapper class for different tokenizer implementations ## Use cases include: - Encoding with Tiktoken API (faster) - Loading TikToken tokenizer @@ -381,7 +381,6 @@ def from_config(cls, config: TokenizerConfig): special_tokens=special_tokens, config=config ) - def get_bos_token_id(self): return self.bos_token_id @@ -501,7 +500,6 @@ def from_disk(cls, name: str, cachedir: Optional[Union[str, Path]] = None): special_tokens=config.special_tokens.list(), config=config ) - @classmethod def truncated_from_pretrained(cls, name: str, new_vocab_size: int, source: str = "tiktoken") -> Tokenizer: @@ -545,7 +543,7 @@ def truncated_from_pretrained(cls, name: str, new_vocab_size: int, source: str = merges_to_remove -= 1 config = TokenizerConfig( - name=f"{tokenizer.config.name}_truncated_{new_vocab_size}", + name=new_name, source=tokenizer.config.source, dirname=tokenizer.config.dirname.parent / new_name, vocab_size=new_vocab_size, diff --git a/src/gpt_lab/utils/special_tokens.py b/src/gpt_lab/utils/special_tokens.py index 67ae5f0..f42c887 100644 --- a/src/gpt_lab/utils/special_tokens.py +++ b/src/gpt_lab/utils/special_tokens.py @@ -24,6 +24,9 @@ def list(self) -> List[str]: """ return [v for v in self.__dict__.values() if v is not None] + def __list__(self): + return self.list() + def dict(self) -> dict: return {k: v for k, v in self.__dict__.items() if v is not None} From 2b7ac793e957f309d334bda152a56dd2d33166ae Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Mon, 18 May 2026 00:28:47 +0200 Subject: [PATCH 06/18] tokenizer refactoring: serialization + truncation --- pyproject.toml | 1 + src/gpt_lab/tokenizer/serialization.py | 85 +++++++++ src/gpt_lab/tokenizer/tokenizer.py | 239 +++++++++++++++---------- 
src/gpt_lab/tokenizer/truncation.py | 123 +++++++++++++ uv.lock | 19 ++ 5 files changed, 375 insertions(+), 92 deletions(-) create mode 100644 src/gpt_lab/tokenizer/serialization.py create mode 100644 src/gpt_lab/tokenizer/truncation.py diff --git a/pyproject.toml b/pyproject.toml index 1016625..801183b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "gradio==3.18.0", "jinja2==3.1.6", "kernels==0.11.7", + "msgpack>=1.1.2", "numpy==2.4.3", "psutil==7.2.2", "pydantic==2.12.5", diff --git a/src/gpt_lab/tokenizer/serialization.py b/src/gpt_lab/tokenizer/serialization.py new file mode 100644 index 0000000..0c683ba --- /dev/null +++ b/src/gpt_lab/tokenizer/serialization.py @@ -0,0 +1,85 @@ +""" +Phase 1 (serialization): deterministic msgpack-based tokenizer persistence. + +Introduced in Phase 1 of the tokenizer refactor. This module provides +functions to save/load/validate `mergeable_ranks` in a deterministic, +portable, and fingerprinted format using `msgpack` and `sha256`. +""" +from __future__ import annotations + +import msgpack +from hashlib import sha256 +from pathlib import Path +from typing import Dict + +def save_mergeable_ranks(path: Path, mergeable_ranks: Dict[bytes, int]) -> str: + """Save mergeable_ranks deterministically (rank-sorted) to `path`. + + Returns the sha256 hex fingerprint of the written payload. + """ + sorted_items = sorted(mergeable_ranks.items(), key=lambda x: x[1]) + # ensure insertion order is by rank + sorted_dict = {k: v for k, v in sorted_items} + + payload = msgpack.packb({"version": 1, "mergeable_ranks": sorted_dict}, use_bin_type=True) + fingerprint = sha256(payload).hexdigest() + + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "wb") as f: + f.write(payload) + + return fingerprint + + +def load_mergeable_ranks(path: Path) -> Dict[bytes, int]: + """Load mergeable_ranks from a msgpack file written by `save_mergeable_ranks`. 
+ + Raises ValueError on unsupported version or malformed payload. + """ + raw = path.read_bytes() + data = msgpack.unpackb(raw, raw=False) + + version = data.get("version") + if not isinstance(version, int): + raise ValueError("Tokenizer file 'version' must be an integer.") + if version != 1: + raise ValueError(f"Unsupported tokenizer file version: {version}. Expected 1.") + + raw_map = data.get("mergeable_ranks") + if raw_map is None: + raise ValueError("Missing 'mergeable_ranks' in tokenizer file.") + + # When unpacked with raw=False keys are bytes + mergeable_ranks: Dict[bytes, int] = {k: v for k, v in raw_map.items()} + return mergeable_ranks + + +def validate_mergeable_ranks(mergeable_ranks: Dict[bytes, int]) -> None: + """Validate rank semantics (non-empty, byte keys, integer contiguous ranks). + + Raises AssertionError with informative messages on failure. + """ + ranks = list(mergeable_ranks.values()) + + assert len(ranks) > 0, "Tokenizer cannot have empty mergeable_ranks." + assert all(isinstance(k, (bytes, bytearray)) and len(k) > 0 for k in mergeable_ranks.keys()), ( + "All mergeable rank keys must be non-empty bytes." + ) + assert all(isinstance(r, int) for r in ranks), "All mergeable rank values must be integers." + assert all(0 <= r < 2 ** 31 for r in ranks), "All ranks must satisfy 0 <= rank < 2**31." + assert len(set(ranks)) == len(ranks), "Duplicate ranks in mergeable_ranks." + assert min(ranks) == 0, "Ranks do not start at 0." + assert max(ranks) == len(ranks) - 1, "Ranks are not contiguous." + + +def validate_no_special_token_overlap(mergeable_ranks: Dict[bytes, int], special_tokens: Dict[str, int]) -> None: + """Ensure no special token (string) collides with a mergeable rank key (bytes). + + Raises AssertionError if overlap detected. 
+ """ + mergeable_set = set(mergeable_ranks.keys()) + special_token_bytes = {t.encode("utf-8") for t in special_tokens.keys()} + overlap = mergeable_set & special_token_bytes + assert not overlap, ( + f"Special tokens overlap with mergeable ranks: {[b.decode('utf-8', errors='replace') for b in overlap]}" + ) diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index 2268ddc..228f5c8 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -4,6 +4,15 @@ import random, json, os, csv import pickle +import warnings +import json as _json +from hashlib import sha256 +from gpt_lab.tokenizer.serialization import ( + save_mergeable_ranks, + load_mergeable_ranks, + validate_mergeable_ranks, + validate_no_special_token_overlap, +) from pathlib import Path from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig @@ -18,6 +27,7 @@ import logging logger = logging.getLogger(__name__) +from gpt_lab.tokenizer.truncation import parse_truncated_name # ------------------------------------------------------------ # FACTORY FUNCTION TO BUILD TOKENIZER FROM CONFIG @@ -113,29 +123,26 @@ def n_ranks(self): def get_token_bytes(self): token_bytes_path = Path(self.config.dirname) / "token_bytes.pt" - if getattr(self, 'token_bytes', None) is not None: - return self.token_bytes_cache - + if getattr(self, "token_bytes", None) is not None: + return self.token_bytes + if token_bytes_path.exists(): token_bytes = torch.load(token_bytes_path) log0(f"Loaded token_bytes from {token_bytes_path}", logger=logger) else: - vocab_size = self.vocab_size - special_set = set(self.special_tokens) - token_strings = [self.decode([token_id]) for token_id in range(vocab_size)] - token_bytes = [] - for token_id in range(vocab_size): - token_str = token_strings[token_id] # the Python string representation of this token - if token_str in special_set: - token_bytes.append(0) # special characters are not counted - else: - id_bytes = 
len(token_str.encode("utf-8")) # number of bytes that make up this token - token_bytes.append(id_bytes) - token_bytes = torch.tensor(token_bytes, dtype=torch.int32, device='cpu') + # Compute byte lengths directly from mergeable_ranks keys (which are bytes) + mergeable = self.mergeable_ranks or {} + # Sort by rank to produce deterministic ordering + sorted_items = sorted(mergeable.items(), key=lambda x: x[1]) + token_bytes_list = [len(token) for token, _ in sorted_items] + # Special tokens are always zero-length for token_bytes + token_bytes_list.extend([0] * len(self.special_tokens)) + token_bytes = torch.tensor(token_bytes_list, dtype=torch.int32, device="cpu") with open(token_bytes_path, "wb") as f: torch.save(token_bytes, f) log0(f"Saved token_bytes to {token_bytes_path}", logger=logger) - self.token_bytes_cache = token_bytes + + self.token_bytes = token_bytes return token_bytes def __call__(self, text, *args, **kwds): @@ -325,13 +332,51 @@ def __init__( def from_pretrained(cls, name: str, source: Optional[str] = None, special_tokens: Optional[SpecialTokens] = None): if special_tokens is None: special_tokens = SpecialTokens() + # Phase 3: handle truncated names early to allow creating or loading + # pre-truncated tokenizers. This must run before any source-dispatch logic. + truncated = parse_truncated_name(name) + if truncated is not None: + base_name, vocab_size = truncated + try: + return cls.from_disk(name) + except FileNotFoundError: + # Expected: not cached yet. Build from base tokenizer. + return cls.truncated_from_pretrained( + base_name, + vocab_size, + source=source or "tiktoken", + special_tokens=special_tokens, + ) + # Do not catch other exceptions here; allow them to propagate. if source is None: + # Build expected-missing exception tuple dynamically, adding any + # tiktoken-specific exception if it can be discovered at runtime. 
+ EXPECTED_MISSING = (FileNotFoundError, KeyError) + try: + import tiktoken + try: + # probe tiktoken for its missing-encoding exception type + tiktoken.get_encoding("__NONEXISTENT_ENCODING__") + except Exception as e: + EXPECTED_MISSING = tuple(set(EXPECTED_MISSING) | {type(e)}) + except Exception: + # tiktoken not available in this environment; proceed with defaults + pass + for source in ("tiktoken", "huggingface", "local"): try: return cls.from_pretrained(name, source=source, special_tokens=special_tokens) - except Exception as e: - print(f"Failed to load tokenizer from source {source} with error: {e}. Trying next source...") - raise ValueError(f"Failed to load tokenizer {name} from all sources.") + except EXPECTED_MISSING as e: + logger.debug(f"Source {source!r} not applicable for {name!r}: {e}") + continue + except Exception: + logger.debug( + f"Unexpected error loading {name!r} from {source!r}:", + exc_info=True, + ) + raise + + raise ValueError(f"Failed to load tokenizer {name!r} from all sources.") elif source == "tiktoken": enc = tiktoken.get_encoding(name) mergeable_ranks = enc._mergeable_ranks @@ -491,10 +536,37 @@ def from_disk(cls, name: str, cachedir: Optional[Union[str, Path]] = None): cachedir = Path(cachedir) # dirname = cachedir / name config = TokenizerConfig.from_directory(name, cachedir=cachedir) - mergeable_ranks = config.get_mergeable_ranks() - log0(f"Loaded tokenizer config from {name} with vocab size " - f"{len(mergeable_ranks) + len(config.special_tokens)}", - logger=logger) + + directory = Path(config.dirname) + msgpack_path = directory / "mergeable_ranks.msgpack" + pkl_path = directory / "vocab.pkl" + + if msgpack_path.exists(): + mergeable_ranks = load_mergeable_ranks(msgpack_path) + elif pkl_path.exists(): + warnings.warn( + f"Loading tokenizer from legacy pickle format at {pkl_path}. 
" + "Re-save this tokenizer to migrate to the msgpack format.", + DeprecationWarning, + stacklevel=2, + ) + with open(pkl_path, "rb") as f: + mergeable_ranks = pickle.load(f) + else: + raise FileNotFoundError( + f"No tokenizer vocab file found in {directory}. Expected {msgpack_path} or {pkl_path}." + ) + + # Validate mergeable ranks and ensure no overlap with special tokens + validate_mergeable_ranks(mergeable_ranks) + special_tokens_map = {tok: 0 for tok in config.special_tokens.list()} + validate_no_special_token_overlap(mergeable_ranks, special_tokens_map) + + log0( + f"Loaded tokenizer config from {name} with vocab size " + f"{len(mergeable_ranks) + len(config.special_tokens)}", + logger=logger, + ) return cls( mergeable_ranks=mergeable_ranks, special_tokens=config.special_tokens.list(), @@ -503,60 +575,13 @@ def from_disk(cls, name: str, cachedir: Optional[Union[str, Path]] = None): @classmethod def truncated_from_pretrained(cls, name: str, new_vocab_size: int, source: str = "tiktoken") -> Tokenizer: - """Construct a tokenizer by truncating the merge-rank table - of a pretrained tokenizer. - - The tokenizer retains: - - all primitive byte tokens - - the earliest merge rules - - all special tokens + """Delegate truncation to tokenizer.truncation.truncated_from_pretrained (Phase 2). - and removes later BPE merges according to merge rank order. - - Because BPE merge ranks are constructed incrementally, - keeping the first K mergeable ranks preserves a valid - prefix of the original tokenizer. + Signature preserved for backward compatibility. """ - new_name = f"{name}_truncated_{new_vocab_size}" + from gpt_lab.tokenizer.truncation import truncated_from_pretrained as _trunc - try: - tokenizer = cls.from_disk(new_name) - log0(f"Found existing truncated tokenizer {new_name} on disk, loading it instead of creating a new one.", logger=logger) - return tokenizer - except Exception as e: - log0(f"No existing truncated tokenizer found on disk for {new_name}. 
Creating a new one by truncating the pretrained tokenizer {name}. Error: {e}", logger=logger, level="warning") - - tokenizer = cls.from_pretrained(name, source=source) - if new_vocab_size - len(tokenizer.special_tokens) < 256: - msg = f"New vocab size {new_vocab_size} is too small to fit all single byte tokens and special tokens. Minimum vocab size is {256 + len(tokenizer.special_tokens)}." - log_error(msg, error_type=ValueError, logger=logger) - - if new_vocab_size >= tokenizer.vocab_size: - return tokenizer - - merges_to_remove = tokenizer.vocab_size - new_vocab_size - mergeable_ranks = dict(tokenizer.mergeable_ranks) # shallow copy, don't mutate original - for merge, _ in sorted(mergeable_ranks.items(), key=lambda x: x[1], reverse=True): - if merges_to_remove <= 0: - break - del mergeable_ranks[merge] - merges_to_remove -= 1 - - config = TokenizerConfig( - name=new_name, - source=tokenizer.config.source, - dirname=tokenizer.config.dirname.parent / new_name, - vocab_size=new_vocab_size, - pat_str=tokenizer.config.pat_str, - special_tokens=tokenizer.config.special_tokens - ) - tokenizer = cls( - mergeable_ranks=mergeable_ranks, - special_tokens=config.special_tokens.list(), - config=config, - ) - tokenizer.update_token_bytes() - return tokenizer + return _trunc(name, new_vocab_size, source=source) @classmethod def get_closest_truncated_from_pretrained(cls, tokenizer: Tokenizer, target_vocab_size: int) -> Tokenizer: @@ -568,20 +593,26 @@ def get_closest_truncated_from_pretrained(cls, tokenizer: Tokenizer, target_voca return cls.truncated_from_pretrained(tokenizer.config.name, closest_vocab_size) def update_token_bytes(self): - if not hasattr(self, 'token_bytes'): - self.token_bytes = self.get_token_bytes() - if len(self.token_bytes) == len(self.mergeable_ranks) + len(self.special_tokens): - log0("No merges were removed, token bytes remain the same.", logger=logger) - return - old_vocab_size = len(self.token_bytes) - self.token_bytes = self.get_token_bytes() - 
self.token_bytes = torch.cat([ - self.token_bytes[:len(self.mergeable_ranks)], # surviving merges, byte counts intact - torch.zeros(len(self.special_tokens), dtype=torch.int32) # special tokens always 0 - ]) + # Recompute token_bytes directly from mergeable_ranks keys to avoid decode()/string roundtrip + if not hasattr(self, "mergeable_ranks") or self.mergeable_ranks is None: + raise RuntimeError("mergeable_ranks is missing when updating token bytes") + + sorted_items = sorted(self.mergeable_ranks.items(), key=lambda x: x[1]) + token_bytes_list = [len(token) for token, _ in sorted_items] + token_bytes_list.extend([0] * len(self.special_tokens)) + new_token_bytes = torch.tensor(token_bytes_list, dtype=torch.int32, device="cpu") + + old_vocab_size = getattr(self, "token_bytes", torch.tensor([])).numel() if hasattr(self, "token_bytes") else 0 + self.token_bytes = new_token_bytes log0(f"Updated token bytes after truncation from {old_vocab_size:,} to {self.vocab_size:,}", logger=logger) - # Recompute token bytes and save to disk, useful after truncating the vocab - self.config.save_to_directory() + # Save token_bytes to disk + token_bytes_path = Path(self.config.dirname) / "token_bytes.pt" + torch.save(self.token_bytes, token_bytes_path) + # Persist tokenizer config/metadata + try: + self.config.save_to_directory() + except Exception: + log0("Warning: failed to save config.pkl after truncation; token_bytes saved.", level="warning", logger=logger) def save_to_directory(self, directory: Optional[Union[str, Path]] = None): # Save the tokenizer's merges and vocab to the specified directory @@ -596,12 +627,36 @@ def save_to_directory(self, directory: Optional[Union[str, Path]] = None): # which is essentially the vocab of the tokenizer. # We don't have a separate merges dict since the mergeable ranks already # encodes the merges in the order they were added during training. 
- self.config.save_to_directory() + # Persist mergeable ranks in deterministic msgpack format and compute fingerprint + msgpack_path = directory / "mergeable_ranks.msgpack" + fingerprint = save_mergeable_ranks(msgpack_path, self.mergeable_ranks) + + # Save token bytes tensor + token_bytes_path = directory / "token_bytes.pt" + if getattr(self, "token_bytes", None) is None: + self.token_bytes = self.get_token_bytes() + torch.save(self.token_bytes, token_bytes_path) + + # Write a lightweight JSON descriptor alongside the pickle config for readability + config_json = { + "name": self.config.name, + "vocab_size": self.config.vocab_size, + "pat_str": self.config.pat_str, + "mergeable_ranks_sha256": fingerprint, + "source": self.config.source, + } + json_path = directory / "tokenizer_config.json" + with open(json_path, "w") as jf: + _json.dump(config_json, jf, indent=2) + + # Keep legacy pickle-based config for backwards compatibility + try: + self.config.save_to_directory() + except Exception: + # If saving the pickle-config fails, still keep msgpack and json + log0("Warning: failed to save config.pkl; msgpack and json were written.", level="warning", logger=logger) - vocab_path = directory / "vocab.pkl" - with open(vocab_path, "wb") as vf: - pickle.dump(self.mergeable_ranks, vf) - log0(f"Saved tokenizer vocab to {vocab_path}", logger=logger) + log0(f"Saved tokenizer mergeable ranks to {msgpack_path}", logger=logger) def encode_special(self, token: str) -> int: return self.special_tokens[token] diff --git a/src/gpt_lab/tokenizer/truncation.py b/src/gpt_lab/tokenizer/truncation.py new file mode 100644 index 0000000..77f4b03 --- /dev/null +++ b/src/gpt_lab/tokenizer/truncation.py @@ -0,0 +1,123 @@ +""" +Phase 2 (truncation): deterministic truncation utilities. + +Introduced in Phase 2 of the tokenizer refactor. 
This module exposes +`parse_truncated_name` and `truncated_from_pretrained` to perform safe, +deterministic tokenizer truncation while preserving byte tokens and +reassigning contiguous ranks. +""" +from __future__ import annotations + +import re +from typing import Optional +from pathlib import Path + +# Lightweight module: avoid importing heavy project modules at import time. +# Logging is optional; use print() for informational messages here. + +TRUNCATED_PATTERN = re.compile(r"^(?P.+)_truncated_(?P\d+)$") + + +def parse_truncated_name(name: str) -> Optional[tuple[str, int]]: + """Parse a truncated tokenizer name like 'foo_truncated_30000'. + + Returns (base_name, vocab_size) or None if the pattern doesn't match. + """ + match = TRUNCATED_PATTERN.match(name) + if match is None: + return None + return match.group("base"), int(match.group("vocab")) + + +def truncated_from_pretrained(base_name: str, new_vocab_size: int, source: str = "tiktoken", special_tokens: Optional[SpecialTokens] = None): + """Create a truncated tokenizer keeping the first K mergeable ranks. + + This function is deterministic, non-mutating to the source tokenizer, + reassigns ranks to be contiguous from 0..K-1, and enforces that all + 256 primitive byte tokens are retained. 
+ """ + # Local import to avoid circular dependencies at module import time + from gpt_lab.tokenizer.tokenizer import Tokenizer + + # Local imports to avoid pulling heavy dependencies at module import time + from gpt_lab.utils.schemas import TokenizerConfig + from gpt_lab.utils.special_tokens import SpecialTokens + + if special_tokens is None: + special_tokens = SpecialTokens() + + new_name = f"{base_name}_truncated_{new_vocab_size}" + + # If a truncated tokenizer already exists on disk, prefer loading it + try: + return Tokenizer.from_disk(new_name) + except FileNotFoundError: + print(f"No existing truncated tokenizer found for {new_name}; creating new one.") + + # Load base tokenizer + base_tok = Tokenizer.from_pretrained(base_name, source=source, special_tokens=special_tokens) + + n_special = len(special_tokens.list()) + n_mergeable_keep = new_vocab_size - n_special + + N_BYTE_TOKENS = 256 + if n_mergeable_keep < N_BYTE_TOKENS: + raise ValueError( + f"Cannot truncate to {n_mergeable_keep} mergeable ranks: must retain all {N_BYTE_TOKENS} byte-level tokens." 
+ ) + + if new_vocab_size >= base_tok.vocab_size: + return base_tok + + # Sort mergeable ranks by original rank (ascending) and keep first K + sorted_items = sorted(base_tok.mergeable_ranks.items(), key=lambda x: x[1]) + kept = sorted_items[:n_mergeable_keep] + + # Reassign ranks contiguously from 0 + new_mergeable = {token: rank for rank, (token, _) in enumerate(kept)} + + # Build new config + config = TokenizerConfig( + name=new_name, + source=base_tok.config.source, + dirname=base_tok.config.dirname.parent / new_name, + vocab_size=new_vocab_size, + pat_str=base_tok.config.pat_str, + special_tokens=base_tok.config.special_tokens, + ) + + new_tokenizer = Tokenizer( + mergeable_ranks=new_mergeable, + special_tokens=config.special_tokens.list(), + config=config, + ) + + # Validate contiguous ranks on the created mergeable_ranks using the + # serialization validation function (loaded lazily to avoid importing + # the package and its heavy dependencies at module import time). + import importlib.util, sys + from pathlib import Path as _P + src_root = _P(__file__).resolve().parents[2] + serial_path = src_root / 'gpt_lab' / 'tokenizer' / 'serialization.py' + spec = importlib.util.spec_from_file_location('tokenizer_serial_local', str(serial_path)) + serial_mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(serial_mod) + serial_mod.validate_mergeable_ranks(new_mergeable) + + # Verify token_bytes semantics (compute from raw bytes keys) + sorted_items_new = sorted(new_mergeable.items(), key=lambda x: x[1]) + token_bytes_list = [len(token) for token, _ in sorted_items_new] + token_bytes_list.extend([0] * len(config.special_tokens.list())) + assert len(token_bytes_list) == new_vocab_size, ( + "Computed token bytes length does not match new_vocab_size" + ) + + # Persist token bytes and config via Tokenizer helper when possible + try: + new_tokenizer.update_token_bytes() + except Exception: + # If torch or disk are unavailable in this environment, we've 
already + # validated the in-memory invariants above. + print("Unable to call update_token_bytes() — environment may lack torch. In-memory checks passed.") + + return new_tokenizer diff --git a/uv.lock b/uv.lock index acfdec3..47faa39 100644 --- a/uv.lock +++ b/uv.lock @@ -576,6 +576,7 @@ dependencies = [ { name = "gradio" }, { name = "jinja2" }, { name = "kernels" }, + { name = "msgpack" }, { name = "numpy" }, { name = "psutil" }, { name = "pydantic" }, @@ -620,6 +621,7 @@ requires-dist = [ { name = "gradio", specifier = "==3.18.0" }, { name = "jinja2", specifier = "==3.1.6" }, { name = "kernels", specifier = "==0.11.7" }, + { name = "msgpack", specifier = ">=1.1.2" }, { name = "numpy", specifier = "==2.4.3" }, { name = "psutil", specifier = "==7.2.2" }, { name = "pydantic", specifier = "==2.12.5" }, @@ -1314,6 +1316,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, ] +[[package]] +name = "msgpack" +version = "1.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4d/f2/bfb55a6236ed8725a96b0aa3acbd0ec17588e6a2c3b62a93eb513ed8783f/msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e", size = 173581, upload-time = "2025-10-08T09:15:56.596Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/bd/8b0d01c756203fbab65d265859749860682ccd2a59594609aeec3a144efa/msgpack-1.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:70a0dff9d1f8da25179ffcf880e10cf1aad55fdb63cd59c9a49a1b82290062aa", size = 81939, upload-time = "2025-10-08T09:15:01.472Z" }, + { url = 
"https://files.pythonhosted.org/packages/34/68/ba4f155f793a74c1483d4bdef136e1023f7bcba557f0db4ef3db3c665cf1/msgpack-1.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:446abdd8b94b55c800ac34b102dffd2f6aa0ce643c55dfc017ad89347db3dbdb", size = 85064, upload-time = "2025-10-08T09:15:03.764Z" }, + { url = "https://files.pythonhosted.org/packages/f2/60/a064b0345fc36c4c3d2c743c82d9100c40388d77f0b48b2f04d6041dbec1/msgpack-1.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c63eea553c69ab05b6747901b97d620bb2a690633c77f23feb0c6a947a8a7b8f", size = 417131, upload-time = "2025-10-08T09:15:05.136Z" }, + { url = "https://files.pythonhosted.org/packages/65/92/a5100f7185a800a5d29f8d14041f61475b9de465ffcc0f3b9fba606e4505/msgpack-1.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:372839311ccf6bdaf39b00b61288e0557916c3729529b301c52c2d88842add42", size = 427556, upload-time = "2025-10-08T09:15:06.837Z" }, + { url = "https://files.pythonhosted.org/packages/f5/87/ffe21d1bf7d9991354ad93949286f643b2bb6ddbeab66373922b44c3b8cc/msgpack-1.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2929af52106ca73fcb28576218476ffbb531a036c2adbcf54a3664de124303e9", size = 404920, upload-time = "2025-10-08T09:15:08.179Z" }, + { url = "https://files.pythonhosted.org/packages/ff/41/8543ed2b8604f7c0d89ce066f42007faac1eaa7d79a81555f206a5cdb889/msgpack-1.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be52a8fc79e45b0364210eef5234a7cf8d330836d0a64dfbb878efa903d84620", size = 415013, upload-time = "2025-10-08T09:15:09.83Z" }, + { url = "https://files.pythonhosted.org/packages/41/0d/2ddfaa8b7e1cee6c490d46cb0a39742b19e2481600a7a0e96537e9c22f43/msgpack-1.1.2-cp312-cp312-win32.whl", hash = "sha256:1fff3d825d7859ac888b0fbda39a42d59193543920eda9d9bea44d958a878029", size = 65096, upload-time = "2025-10-08T09:15:11.11Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/ec/d431eb7941fb55a31dd6ca3404d41fbb52d99172df2e7707754488390910/msgpack-1.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1de460f0403172cff81169a30b9a92b260cb809c4cb7e2fc79ae8d0510c78b6b", size = 72708, upload-time = "2025-10-08T09:15:12.554Z" }, + { url = "https://files.pythonhosted.org/packages/c5/31/5b1a1f70eb0e87d1678e9624908f86317787b536060641d6798e3cf70ace/msgpack-1.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:be5980f3ee0e6bd44f3a9e9dea01054f175b50c3e6cdb692bc9424c0bbb8bf69", size = 64119, upload-time = "2025-10-08T09:15:13.589Z" }, +] + [[package]] name = "multidict" version = "6.7.0" From 55344134bd57d97ef4dfb7dd75196212b345cea6 Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Mon, 18 May 2026 01:22:28 +0200 Subject: [PATCH 07/18] tokenizer: auto + hf + trainer config refactoring --- README.md | 104 ++++++++++ src/gpt_lab/model/auto.py | 67 +------ src/gpt_lab/tokenizer/auto.py | 113 +++++++++++ src/gpt_lab/tokenizer/base.py | 75 ++++++++ src/gpt_lab/tokenizer/bpe.py | 4 +- src/gpt_lab/tokenizer/hf.py | 132 +++++++++++++ src/gpt_lab/tokenizer/tokenizer.py | 257 +++--------------------- src/gpt_lab/utils/schemas.py | 46 ++++- tests/test_tokenizer.py | 300 ++++++++++++++++++++++++++++- 9 files changed, 799 insertions(+), 299 deletions(-) create mode 100644 src/gpt_lab/tokenizer/auto.py create mode 100644 src/gpt_lab/tokenizer/base.py create mode 100644 src/gpt_lab/tokenizer/hf.py diff --git a/README.md b/README.md index af68c81..527c75d 100644 --- a/README.md +++ b/README.md @@ -239,6 +239,110 @@ meta_config = cfg.generate_gpt_config(device="cuda") Next sections detail the different generated components. ### Tokenization + The tokenization implementation are located in [`gpt_lab.tokenizer`](./src/gpt_lab/tokenizer/tokenizer.py). The code only includes BPE tokenization for now (include sentencepiece is a TODO). The tokenizer training is only supported by huggingface implementation for now. 
For inference, the tiktoken implementation is the default one, as it is much faster than the huggingface one. The custom BPE implementation is still under development, and is not functional yet. diff --git a/src/gpt_lab/model/auto.py b/src/gpt_lab/model/auto.py index 35bf840..ed435ed 100644 --- a/src/gpt_lab/model/auto.py +++ b/src/gpt_lab/model/auto.py @@ -13,7 +13,8 @@ ) from gpt_lab.utils.common import print0, print0_dict from gpt_lab.utils.logging import log0, error, log_error -from gpt_lab.tokenizer.tokenizer import get_closest_tokenizer_size, Tokenizer +from gpt_lab.tokenizer.tokenizer import Tokenizer +from gpt_lab.tokenizer.auto import compute_optimal_vocab_size as auto_compute_optimal_vocab_size, build_or_load_tokenizer, resolve_tokenizer from gpt_lab.model.gpt import DenseTransformer from gpt_lab.model.checkpoint import build_meta_model, make_default_run_name @@ -125,49 +126,9 @@ def build_meta_model_from_depth(depth: int, vocab_size: int = -1) -> DenseTransf ) return build_meta_model(config) + # Delegate computation of optimal vocab size to tokenizer.auto def compute_optimal_vocab_size(depth: int) -> int: - """ - Compute optimal vocabulary size based on scaling law from Tao et al. 2024 (https://arxiv.org/abs/2407.13623). - - This is a rough estimate and can be tuned based on experiments. The scaling law is based on the number of scaling parameters in the model, which is approximated here as - $depth * (depth * aspect_ratio) ** 2 - - If self.train_tokenizer is True: the optimal vocab size is rounded to the nearest 1000 for tokenizer cache efficiency. - Else: the optimal vocab size is set to the closest in the tokenizer cache to maximize reuse of existing tokenizers. - - Args: - depth (int): The depth of the model (number of layers). - - Returns: - int: The optimal vocabulary size for the model (including special tokens). 
- """ - assert (self.tokenizer_model is None) or (self.tokenizer_model == "auto") or (not self.train_tokenizer), "Tokenizer model should not be specified if train_tokenizer is True, since we will be training a new tokenizer from scratch. Please set tokenizer_model to None or 'auto'." - - if self.tokenizer_model not in (None, "auto"): - tokenizer = _get_tokenizer_pretrained(self.tokenizer_model) - return tokenizer.vocab_size # vocab size = mergeable ranks size + special tokens size - - # set vocab size based on scaling law from Tao et al. 2024 (https://arxiv.org/abs/2407.13623) - # we approximate the values from their paper with a simple scaling law for vocab size based on depth and aspect ratio - # this is a rough estimate and can be tuned based on experiments. - # we also approximate the vocab size to the closest in the tokenizer cache to maximize reuse of existing tokenizers - _mmodel = build_meta_model_from_depth(depth, vocab_size=1) - n_non_vocab_scaling_params = _mmodel.n_params # vocab size = 1, so n_params ~ Nnv - power = 0.84 - coeff = .2 / (.08 ** power) / (depth * self.aspect_ratio) - opt_vocab_size = coeff * (n_non_vocab_scaling_params ** power) # V ~ .2 / d_model * (n_scaling_params / 0.08) ^.84 - del _mmodel # free memory - print0(f"Number of non-vocabulary scaling parameters for depth {depth}: {n_non_vocab_scaling_params:.2e}") - if not self.train_tokenizer: - _, vocab_size = get_closest_tokenizer_size(opt_vocab_size) - else: - step = 10 ** (int(math.log10(opt_vocab_size)) - 1) - vocab_size = round(opt_vocab_size / step) * step # round to nearest log10 for better tokenizer cache efficiency - - if vocab_size < 256: - raise ValueError(f"Vocab size must be specified and at least 256 to ensure all unicode characters are supported. Computed optimal vocab size based on scaling law is {opt_vocab_size}, but got {self.vocab_size}. 
Please set vocab_size to a value >= 256.") - - return vocab_size + len(special_tokens) # add special tokens to vocab size + return auto_compute_optimal_vocab_size(depth, self.aspect_ratio, self.train_tokenizer, self.tokenizer_model, special_tokens) # TODO: for more efficient training of the model make # vocab size = ((config.vocab_size + pad_vocab_size_to - 1) // pad_vocab_size_to) * pad_vocab_size_to @@ -180,8 +141,9 @@ def compute_optimal_vocab_size(depth: int) -> int: model = build_meta_model_from_depth(self.depth, vocab_size=vocab_size) # initiate tokenizer based on vocab size and user config - if (self.tokenizer_model not in (None, "auto")) or not self.train_tokenizer: - tname = self.tokenizer_model or get_closest_tokenizer_size(vocab_size)[0] + if (self.tokenizer_model not in (None, "auto")) or not self.train_tokenizer: + # attempt to load or resolve tokenizer + tname = self.tokenizer_model or resolve_tokenizer(self.tokenizer_model, vocab_size, special_tokens) tokenizer = _get_tokenizer_pretrained(tname) else: # otherwise train tokenizer # choose a pat_str based on vocab size (method/thresholds arbitrary for now) @@ -198,22 +160,11 @@ def compute_optimal_vocab_size(depth: int) -> int: _vs = f"{vocab_size//1000:,}k" if vocab_size < 1e6 else f"{vocab_size/1_000_000:.2f}M" _tname = f"{self.name}_{_vs}" - from gpt_lab.utils.schemas import TokenizerTrainerConfig - from gpt_lab.tokenizer.corpus import TokenizerCorpus log0(f"Training new tokenizer with vocab size {vocab_size} using pattern " f"{pat_str} on corpus from {str(DATA_DIR / 'corpus' / self.name)}. 
This may take a while...", logger=logger, level="warning") - corpus = TokenizerCorpus.from_sources( - corpus_dir=DATA_DIR / "corpus" / self.name, - # default sources for now - max_chars=vocab_size * 4 * 100, - random_seed=self.random_seed, - ) - _tok_trainer = TokenizerTrainerConfig( - name=_tname, dirname=self.dirname, vocab_size=int(vocab_size), - pat_str=PAT_STR.get(pat_str, "gpt2"), special_tokens=special_tokens - ) - tokenizer = Tokenizer.train_from_iterator(_tok_trainer, iterator=corpus.iterator()) + + tokenizer = build_or_load_tokenizer(self.tokenizer_model, int(vocab_size), True, _tname, PAT_STR.get(pat_str, "gpt2"), special_tokens, DATA_DIR / "corpus" / self.name, self.random_seed, dirname=self.dirname) param_counts = model.n_params_per_layer() diff --git a/src/gpt_lab/tokenizer/auto.py b/src/gpt_lab/tokenizer/auto.py new file mode 100644 index 0000000..4ecb7f9 --- /dev/null +++ b/src/gpt_lab/tokenizer/auto.py @@ -0,0 +1,113 @@ +""" +Phase 4 (auto): tokenizer orchestration helpers. + +Introduced in Phase 4 of the tokenizer refactor. This module centralizes +tokenizer selection, optimal-vocab computations and build-or-load +orchestration so `AutoGPTConfig` can remain thin and model-config driven. + +Note: This module may import from model config utilities (scaling laws +require architecture information). Model config MUST NOT import from +this module to avoid cycles; the dependency is one-way. 
+""" +from __future__ import annotations + +import math +from typing import Optional + +from gpt_lab.tokenizer.tokenizer import Tokenizer, get_closest_tokenizer_size +from gpt_lab.utils.special_tokens import SpecialTokens +from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig +from gpt_lab.utils.logging import log0, log_error + + +def compute_optimal_vocab_size(depth: int, aspect_ratio: int, train_tokenizer: bool, tokenizer_model: Optional[str], special_tokens: SpecialTokens, get_closest=get_closest_tokenizer_size) -> int: + """Compute optimal vocab size using the project's scaling-law approximation. + + Returns the vocabulary size including special tokens. + """ + # Build a small helper to create the meta model for the depth + def build_meta_model_from_depth(d: int, vocab_size: int = -1): + # Import here to avoid circular dependency with gpt_lab.model.auto + from gpt_lab.utils.schemas import TransformerConfig + from gpt_lab.model.checkpoint import build_meta_model + + config = TransformerConfig( + tf_type="dense", + vocab_size=vocab_size, + max_context=2048, + d_model=(d * aspect_ratio), + d_ffn=4 * (d * aspect_ratio), + n_layers=d, + n_heads=1, + d_head=1, + ) + return build_meta_model(config) + + assert (tokenizer_model is None) or (tokenizer_model == "auto") or (not train_tokenizer) + + if tokenizer_model not in (None, "auto"): + tokenizer = Tokenizer.from_pretrained(tokenizer_model) + return tokenizer.vocab_size + + _mmodel = build_meta_model_from_depth(depth, vocab_size=1) + n_non_vocab_scaling_params = _mmodel.n_params + power = 0.84 + coeff = .2 / (.08 ** power) / (depth * aspect_ratio) + opt_vocab_size = coeff * (n_non_vocab_scaling_params ** power) + del _mmodel + log0(f"Number of non-vocabulary scaling parameters for depth {depth}: {n_non_vocab_scaling_params:.2e}") + + if not train_tokenizer: + _, vocab_size = get_closest(opt_vocab_size) + else: + step = 10 ** (int(math.log10(opt_vocab_size)) - 1) + vocab_size = round(opt_vocab_size 
/ step) * step + + if vocab_size < 256: + raise ValueError("Computed optimal vocab size is <256; increase model size or set vocab_size explicitly.") + + return int(vocab_size) + len(special_tokens.list()) + + +def resolve_tokenizer(name: Optional[str], vocab_size: int, special_tokens: SpecialTokens) -> str: + """Return a tokenizer name to use given an explicit name or a vocab size. + + If `name` is provided and not 'auto', return it. Otherwise choose the + closest cached tokenizer name for `vocab_size`. + """ + if name not in (None, "auto"): + return name + return get_closest_tokenizer_size(vocab_size)[0] + + +def build_or_load_tokenizer(tname: Optional[str], vocab_size: int, train_tokenizer: bool, base_name: str, pat_str: str, special_tokens: SpecialTokens, data_dir, random_seed: int, dirname=None): + """Orchestrate loading or training of a tokenizer. + + - If `not train_tokenizer`, attempt to load a pretrained tokenizer. + - Else, train a new tokenizer using the corpus and `TokenizerTrainerConfig`. + Returns a `Tokenizer` instance. 
+ """ + if not train_tokenizer: + name_or_choice = tname or resolve_tokenizer(tname, vocab_size, special_tokens) + try: + return Tokenizer.from_pretrained(name_or_choice) + except Exception as e: + log0(f"Error loading tokenizer {name_or_choice}: {e}", level="warning") + # Try to construct from config/disk + try: + cfg = TokenizerConfig(name=name_or_choice, source="tiktoken", vocab_size=vocab_size, special_tokens=special_tokens, pat_str=pat_str) + return Tokenizer.from_config(cfg) + except Exception as e2: + log0(f"Fallback to local tokenizer load failed: {e2}", level="warning") + cfg2 = TokenizerConfig.from_directory(name_or_choice) + mergeable = cfg2.get_mergeable_ranks() + return Tokenizer(mergeable_ranks=mergeable, special_tokens=special_tokens.list(), config=cfg2) + + # Train a new tokenizer + from gpt_lab.tokenizer.corpus import TokenizerCorpus + + _tname = base_name + trainer_cfg = TokenizerTrainerConfig(name=_tname, dirname=dirname or base_name, vocab_size=vocab_size, pat_str=pat_str, special_tokens=special_tokens) + corpus = TokenizerCorpus.from_sources(corpus_dir=data_dir, max_chars=vocab_size * 4 * 100, random_seed=random_seed) + tokenizer = Tokenizer.train_from_iterator(text_iterator=corpus.iterator(), config=trainer_cfg) + return tokenizer diff --git a/src/gpt_lab/tokenizer/base.py b/src/gpt_lab/tokenizer/base.py new file mode 100644 index 0000000..09e35bc --- /dev/null +++ b/src/gpt_lab/tokenizer/base.py @@ -0,0 +1,75 @@ +from gpt_lab.utils.schemas import TokenizerConfig +from gpt_lab.utils.logging import log0 +from pathlib import Path +from typing import Optional +import torch +import logging + +logger = logging.getLogger(__name__) +# ------------------------------------------------------------ +# BASE TOKENIZER INTERFACE (for consistency) +# ------------------------------------------------------------ + +class _BaseTokenizer: + """Base tokenizer class defining the common interface for all tokenizers. 
+ + This class is not meant to be used directly, but rather to be inherited by specific tokenizer implementations. + + Should implement the following methods: + - encode: to convert text to token ids + - decode: to convert token ids back to text + - encode_special: to encode special tokens to their corresponding ids""" + def __init__(self, config: Optional[TokenizerConfig] = None): + self.config = config + self.special_tokens = None + self.mergeable_ranks = None + try: + self.token_bytes = self.get_token_bytes() + except Exception as e: + log0(f"Failed to get token bytes during initialization: {e}. " \ + f"This may cause issues with optimizers that rely on token byte lengths. "\ + "You can try calling get_token_bytes() manually after initialization to see the full error message and debug the issue.", + level="warning", logger=logger) + + def get_vocab(self): + return {**self.mergeable_ranks, **self.special_tokens} + + @property + def vocab_size(self): + "vocab_size value icludes both mergeable ranks and special tokens" + return len(self.mergeable_ranks) + len(self.special_tokens) + + @property + def n_special_tokens(self): + return len(self.special_tokens) + + @property + def n_ranks(self): + return len(self.mergeable_ranks) + + def get_token_bytes(self): + token_bytes_path = Path(self.config.dirname) / "token_bytes.pt" + if getattr(self, "token_bytes", None) is not None: + return self.token_bytes + + if token_bytes_path.exists(): + token_bytes = torch.load(token_bytes_path) + log0(f"Loaded token_bytes from {token_bytes_path}", logger=logger) + else: + # Compute byte lengths directly from mergeable_ranks keys (which are bytes) + mergeable = self.mergeable_ranks or {} + # Sort by rank to produce deterministic ordering + sorted_items = sorted(mergeable.items(), key=lambda x: x[1]) + token_bytes_list = [len(token) for token, _ in sorted_items] + # Special tokens are always zero-length for token_bytes + token_bytes_list.extend([0] * len(self.special_tokens)) + 
token_bytes = torch.tensor(token_bytes_list, dtype=torch.int32, device="cpu") + with open(token_bytes_path, "wb") as f: + torch.save(token_bytes, f) + log0(f"Saved token_bytes to {token_bytes_path}", logger=logger) + + self.token_bytes = token_bytes + return token_bytes + + def __call__(self, text, *args, **kwds): + return self.encode(text, *args, **kwds) \ No newline at end of file diff --git a/src/gpt_lab/tokenizer/bpe.py b/src/gpt_lab/tokenizer/bpe.py index aa3410e..26cb0ed 100644 --- a/src/gpt_lab/tokenizer/bpe.py +++ b/src/gpt_lab/tokenizer/bpe.py @@ -165,7 +165,9 @@ def bpe_fast(corpus_iter_fn, corpus_path, config): pair_locs = defaultdict(list) pretknzr = SimplePreTokenizer(config) - for head in tqdm(iter_word_nodes(corpus_iter_fn, corpus_path, pretknzr), desc="Building initial pair stats", total=config.max_chars // 10_000): + tp = getattr(config, "training_params", None) + max_chars_for_progress = tp.max_chars if tp is not None else config.max_chars + for head in tqdm(iter_word_nodes(corpus_iter_fn, corpus_path, pretknzr), desc="Building initial pair stats", total=max_chars_for_progress // 10_000): all_words.append(head) for node in collect_pairs(head): pair = (node.sym, node.next.sym) diff --git a/src/gpt_lab/tokenizer/hf.py b/src/gpt_lab/tokenizer/hf.py new file mode 100644 index 0000000..a867dc4 --- /dev/null +++ b/src/gpt_lab/tokenizer/hf.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +import os, json +from pathlib import Path +from typing import Iterable, Dict + +from gpt_lab.utils.logging import log_all +from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig +from gpt_lab.utils.special_tokens import SpecialTokens +from gpt_lab.tokenizer.base import _BaseTokenizer + +try: + from tokenizers import Tokenizer as HFTokenizer + from tokenizers import decoders, pre_tokenizers, Regex + from tokenizers.models import BPE + from tokenizers.trainers import BpeTrainer +except Exception: + HFTokenizer = None +import logging + +logger 
logger = logging.getLogger(__name__)


def _require_hf_tokenizers(message: str) -> None:
    """Abort with ``RuntimeError`` when the optional ``tokenizers`` package is missing."""
    if HFTokenizer is None:
        log_all(message, level="error", logger=logger)
        # NOTE(review): log_all is defined elsewhere; in case it only logs,
        # raise explicitly so callers never go on to call ``None``.
        raise RuntimeError(message)


def _bytelevel_char_to_byte() -> Dict[str, int]:
    """Build the inverse of the GPT-2 byte-to-unicode table.

    The ByteLevel pre-tokenizer represents every byte as one printable unicode
    character: printable bytes map to themselves, the remaining ones are
    shifted to code points starting at 256 (space becomes ``Ġ``, newline
    becomes ``Ċ``).
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    table = {chr(b): b for b in printable}
    printable_set = set(printable)
    shifted = 0
    for b in range(256):
        if b not in printable_set:
            table[chr(256 + shifted)] = b
            shifted += 1
    return table


def _merge_to_bytes(merge, char_to_byte: Dict[str, int]) -> bytes:
    """Convert one serialized merge into the raw bytes of the merged token.

    ``merge`` is either a ``[left, right]`` pair or a single ``"left right"``
    string, depending on how the installed ``tokenizers`` version serializes
    merges.
    """
    left, right = merge.split(" ") if isinstance(merge, str) else merge
    raw = bytearray()
    for ch in left + right:
        if ch in char_to_byte:
            raw.append(char_to_byte[ch])
        else:
            # Not expected with a ByteLevel alphabet; keep the character
            # rather than failing the whole conversion.
            raw.extend(ch.encode("utf-8"))
    return bytes(raw)


class HuggingFaceTokenizerWrapper(_BaseTokenizer):
    """Light wrapper around HuggingFace `tokenizers` tokenizer.

    Provides a compatible subset of the previous wrapper used by
    Tokenizer.from_pretrained and Tokenizer.from_directory.
    """
    def __init__(self, tokenizer, config: TokenizerConfig):
        # The base-class __init__ is not called here: it assigns
        # ``self.special_tokens``, which is a read-only property on this class.
        self.main = tokenizer
        self.config = config

    @property
    def special_tokens(self):
        """List the content strings of the tokenizer's added tokens."""
        special_tokens_map = self.main.get_added_tokens_decoder()
        return [w.content for w in special_tokens_map.values()]

    @classmethod
    def from_pretrained(cls, hf_path: str):
        """Load a tokenizer via ``HFTokenizer.from_pretrained(hf_path)``.

        Raises:
            RuntimeError: if the ``tokenizers`` package could not be imported.
        """
        _require_hf_tokenizers("tokenizers library not available, cannot load HuggingFace tokenizer")
        tokenizer = HFTokenizer.from_pretrained(hf_path)
        config = TokenizerConfig(
            name=hf_path,
            source="huggingface",
            vocab_size=tokenizer.get_vocab_size(),
            pat_str=None,
            special_tokens=SpecialTokens(),
        )
        return cls(tokenizer, config=config)

    @classmethod
    def from_directory(cls, tokenizer_dir: str):
        """Load a tokenizer from ``<tokenizer_dir>/tokenizer.json``.

        Raises:
            RuntimeError: if the ``tokenizers`` package could not be imported.
        """
        _require_hf_tokenizers("tokenizers library not available, cannot load HuggingFace tokenizer")
        tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
        tokenizer = HFTokenizer.from_file(tokenizer_path)
        config = TokenizerConfig(
            name=tokenizer_dir,
            source="local",
            vocab_size=tokenizer.get_vocab_size(),
            pat_str=None,
            special_tokens=SpecialTokens(),
        )
        return cls(tokenizer, config=config)

    def id_to_token(self, id):
        """Return the token string for ``id`` as reported by the wrapped tokenizer."""
        return self.main.id_to_token(id)

    def encode(self, text, add_special_tokens=False):
        """Encode ``text`` and return the list of token ids."""
        return self.main.encode(text, add_special_tokens=add_special_tokens).ids

    def decode(self, ids):
        """Decode ``ids`` to text, keeping special tokens in the output."""
        return self.main.decode(ids, skip_special_tokens=False)

    def save(self, tokenizer_dir: str):
        """Write the tokenizer to ``<tokenizer_dir>/tokenizer.json``, creating the directory."""
        os.makedirs(tokenizer_dir, exist_ok=True)
        tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
        self.main.save(tokenizer_path)


def train_huggingface_from_iterator(text_iterator: Iterable[str], config: TokenizerTrainerConfig) -> Dict[bytes, int]:
    """Train a HuggingFace BPE tokenizer and return mergeable_ranks mapping.

    Args:
        text_iterator: iterable of training texts.
        config: trainer configuration; ``pat_str``, ``vocab_size`` and
            ``special_tokens`` are read from it, and ``show_progress`` from
            ``config.training_params`` when present, else from ``config``.

    Returns:
        A dict mapping byte-strings to ranks (integers): the 256 single bytes
        at ranks 0-255, followed by one entry per learned merge at rank
        ``256 + merge index``.

    Raises:
        RuntimeError: if the ``tokenizers`` package could not be imported.
    """
    _require_hf_tokenizers("tokenizers library is required for HuggingFace trainer")

    tknzr = HFTokenizer(
        BPE(
            byte_fallback=True,
            unk_token=None,
            fuse_unk=False
        )
    )
    tknzr.normalizer = None
    pattern = Regex(config.pat_str)
    tknzr.pre_tokenizer = pre_tokenizers.Sequence([
        pre_tokenizers.Split(pattern=pattern, behavior="isolated", invert=False),
        pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
    ])
    tknzr.decoder = decoders.ByteLevel()
    tknzr.post_processor = None
    initial_alphabet = pre_tokenizers.ByteLevel.alphabet()

    # Prefer training-specific params container when available
    tp = getattr(config, "training_params", None)
    show_progress = tp.show_progress if tp is not None else config.show_progress
    # Special tokens are appended by the caller, so they are excluded from the
    # size the BPE trainer is asked to reach.
    vocab_size_no_special = config.vocab_size - len(config.special_tokens.list())
    trainer = BpeTrainer(
        vocab_size=vocab_size_no_special,
        show_progress=show_progress,
        min_frequency=0,
        initial_alphabet=initial_alphabet,
        special_tokens=[],
    )
    tknzr.train_from_iterator(iterator=text_iterator, trainer=trainer)

    merges = json.loads(tknzr.to_str())["model"]["merges"]

    # Merges are written with the ByteLevel alphabet; map every character back
    # to the byte it stands for instead of UTF-8-encoding the stand-in
    # characters, which would corrupt any token holding a control or
    # non-ASCII byte.
    char_to_byte = _bytelevel_char_to_byte()
    mergeable_ranks = {
        _merge_to_bytes(merge, char_to_byte): rank + 256
        for rank, merge in enumerate(merges)
    }
    # Add single-byte tokens
    mergeable_ranks.update({bytes([i]): i for i in range(256) if bytes([i]) not in mergeable_ranks})
    return mergeable_ranks
100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -1,12 +1,13 @@ from __future__ import annotations import torch -import random, json, os, csv +import random, csv import pickle import warnings import json as _json -from hashlib import sha256 + +from gpt_lab.tokenizer.base import _BaseTokenizer from gpt_lab.tokenizer.serialization import ( save_mergeable_ranks, load_mergeable_ranks, @@ -21,7 +22,7 @@ from gpt_lab.utils.logging import log0, log_error import tiktoken -from tokenizers import Tokenizer as HFTokenizer +from gpt_lab.tokenizer.hf import HuggingFaceTokenizerWrapper, train_huggingface_from_iterator from typing import Callable, Iterable, List, Optional, Union, Tuple, Dict import logging @@ -80,74 +81,6 @@ def get_closest_tokenizer_size(vocab_size: int) -> Tuple[str, int]: def build_tokenizer(config: TokenizerConfig) -> Callable: return Tokenizer.from_config(config) -# ------------------------------------------------------------ -# BASE TOKENIZER INTERFACE (for consistency) -# ------------------------------------------------------------ - -class _BaseTokenizer: - """Base tokenizer class defining the common interface for all tokenizers. - - This class is not meant to be used directly, but rather to be inherited by specific tokenizer implementations. - - Should implement the following methods: - - encode: to convert text to token ids - - decode: to convert token ids back to text - - encode_special: to encode special tokens to their corresponding ids""" - def __init__(self, config: Optional[TokenizerConfig] = None): - self.config = config - self.special_tokens = None - self.mergeable_ranks = None - try: - self.token_bytes = self.get_token_bytes() - except Exception as e: - log0(f"Failed to get token bytes during initialization: {e}. " \ - f"This may cause issues with optimizers that rely on token byte lengths. 
"\ - "You can try calling get_token_bytes() manually after initialization to see the full error message and debug the issue.", - level="warning", logger=logger) - - def get_vocab(self): - return {**self.mergeable_ranks, **self.special_tokens} - - @property - def vocab_size(self): - "vocab_size value icludes both mergeable ranks and special tokens" - return len(self.mergeable_ranks) + len(self.special_tokens) - - @property - def n_special_tokens(self): - return len(self.special_tokens) - - @property - def n_ranks(self): - return len(self.mergeable_ranks) - - def get_token_bytes(self): - token_bytes_path = Path(self.config.dirname) / "token_bytes.pt" - if getattr(self, "token_bytes", None) is not None: - return self.token_bytes - - if token_bytes_path.exists(): - token_bytes = torch.load(token_bytes_path) - log0(f"Loaded token_bytes from {token_bytes_path}", logger=logger) - else: - # Compute byte lengths directly from mergeable_ranks keys (which are bytes) - mergeable = self.mergeable_ranks or {} - # Sort by rank to produce deterministic ordering - sorted_items = sorted(mergeable.items(), key=lambda x: x[1]) - token_bytes_list = [len(token) for token, _ in sorted_items] - # Special tokens are always zero-length for token_bytes - token_bytes_list.extend([0] * len(self.special_tokens)) - token_bytes = torch.tensor(token_bytes_list, dtype=torch.int32, device="cpu") - with open(token_bytes_path, "wb") as f: - torch.save(token_bytes, f) - log0(f"Saved token_bytes to {token_bytes_path}", logger=logger) - - self.token_bytes = token_bytes - return token_bytes - - def __call__(self, text, *args, **kwds): - return self.encode(text, *args, **kwds) - # ------------------------------------------------------------ # DUMMY TOKENIZER INSTANCE (for quick tests/dev) # ------------------------------------------------------------ @@ -185,105 +118,6 @@ def encode(self, text, *args, **kwargs): def decode(self, tokens, *args, **kwargs): return "".join([chr(t) for t in tokens]) -# 
------------------------------------------------------------ -# HUGGINGFACE TOKENIZER WRAPPER (for some utilities) -# ------------------------------------------------------------ - -class HuggingFaceTokenizerWrapper(_BaseTokenizer): - """Light wrapper around HuggingFace Tokenizer for some utilities""" - - def __init__(self, tokenizer: HFTokenizer, config: TokenizerConfig): - super().__init__(config) - self.main = tokenizer - - @property - def special_tokens(self): - special_tokens_map = self.main.get_added_tokens_decoder() - special_tokens = [w.content for w in special_tokens_map.values()] - return special_tokens - - @classmethod - def from_pretrained(cls, hf_path): - # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") - tokenizer = HFTokenizer.from_pretrained(hf_path) - config = TokenizerConfig( - name=hf_path, - source="huggingface", - vocab_size=tokenizer.get_vocab_size(), - pat_str=None, # TODO: extract pattern from HuggingFace tokenizer if possible, otherwise use a default one - special_tokens=tokenizer.get_added_tokens_decoder() - ) - return cls(tokenizer, config=config) - - @classmethod - def from_directory(cls, tokenizer_dir): - # init from a local directory on disk (e.g. "out/tokenizer") - tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") - tokenizer = HFTokenizer.from_file(tokenizer_path) - config = TokenizerConfig( - name=tokenizer_dir, - source="local", - vocab_size=tokenizer.get_vocab_size(), - pat_str=None, - special_tokens=SpecialTokens(), # tokenizer.get_added_tokens_decoder() - ) - return cls(tokenizer, config=config) - - def id_to_token(self, id): - return self.main.id_to_token(id) - - def _encode_one(self, text, prepend=None, append=None, num_threads=None): - # encode a single string - # prepend/append can be either a string of a special token or a token id directly. 
- # num_threads is ignored (only used by the nanochat Tokenizer for parallel encoding) - assert isinstance(text, str) - ids = [] - if prepend is not None: - prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend) - ids.append(prepend_id) - ids.extend(self.main.encode(text, add_special_tokens=False).ids) - if append is not None: - append_id = append if isinstance(append, int) else self.encode_special(append) - ids.append(append_id) - return ids - - def encode_special(self, text): - # encode a single special token via exact match - return self.main.mergeable_ranks(text) - - def get_bos_token_id(self): - # Different HuggingFace models use different BOS tokens and there is little consistency - # 1) attempt to find a <|bos|> token - bos = self.encode_special("<|bos|>") - # 2) if that fails, attempt to find a <|endoftext|> token (e.g. GPT-2 models) - if bos is None: - bos = self.encode_special("<|endoftext|>") - # 3) if these fail, it's better to crash than to silently return None - assert bos is not None, "Failed to find BOS token in tokenizer" - return bos - - def encode_ordinary(self, text, *args, **kwargs): - # encode a single string without adding special tokens - return self._encode_one(text, *args, **kwargs) - - def encode(self, text, *args, **kwargs): - if isinstance(text, str): - return self._encode_one(text, *args, **kwargs) - elif isinstance(text, list): - return [self._encode_one(t, *args, **kwargs) for t in text] - else: - raise ValueError(f"Invalid input type: {type(text)}") - - def decode(self, ids): - return self.main.decode(ids, skip_special_tokens=False) - - def save(self, tokenizer_dir): - # save the tokenizer to disk - os.makedirs(tokenizer_dir, exist_ok=True) - tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") - self.main.save(tokenizer_path) - print(f"Saved tokenizer to {tokenizer_path}") - # ------------------------------------------------------------ # MAIN TOKENIZER CLASS # - train with huggingface, @@ 
-439,92 +273,53 @@ def train_from_iterator( special_tokens = config.special_tokens.list() vocab_size_no_special = config.vocab_size - len(special_tokens) # TODO: make the other tokenizers for comparison; lines +1 and +2 below are temporary - if not config.trainer == "huggingface": - msg = f"Training tokenizer with trainer {config.trainer!r} is not implemented yet. Please use 'huggingface' trainer for now." + tp = getattr(config, "training_params", None) + if tp is None: + # Legacy fallback + tp_trainer = config.trainer + else: + tp_trainer = tp.trainer + + if tp_trainer != "huggingface": + msg = f"Training tokenizer with trainer {tp_trainer!r} is not implemented yet. Please use 'huggingface' trainer for now." log_error(msg, error_type=NotImplementedError, logger=logger) # TODO: make pretokenizer here -> options: 1. gpt2, 2. custom - if config.trainer == "tiktoken": + if tp_trainer == "tiktoken": from tiktoken._educational import bpe_train log0("Training tokenizer with tiktoken is a TODO for future improvement.", level="warning", logger=logger) # TODO: WIP, not tested yet mergeable_ranks = bpe_train(data=text_iterator, vocab_size=vocab_size_no_special, pat_str=config.pat_str) - elif config.trainer == "huggingface": - from tokenizers import decoders, pre_tokenizers, Regex - from tokenizers.models import BPE - from tokenizers.trainers import BpeTrainer - - tknzr = HFTokenizer( - BPE( - byte_fallback=True, - unk_token=None, - fuse_unk=False - )) - tknzr.normalizer = None - pattern = Regex(config.pat_str) - tknzr.pre_tokenizer = pre_tokenizers.Sequence([ - pre_tokenizers.Split(pattern=pattern, behavior="isolated", invert=False), - pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False) - ]) - tknzr.decoder = decoders.ByteLevel() - tknzr.post_processor = None - initial_alphabet = pre_tokenizers.ByteLevel.alphabet() - - trainer = BpeTrainer( - vocab_size=vocab_size_no_special, - show_progress=True, - min_frequency=0, - initial_alphabet=initial_alphabet, - 
special_tokens=[] - ) - trainer.show_progress = config.show_progress - tknzr.train_from_iterator(iterator=text_iterator, trainer=trainer) - - # os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "0" - # print("Tokenizer state", tknzr.model.__getstate__().keys()) - merges = json.loads(tknzr.to_str())["model"]["merges"] - def merge_to_bytes(merge): - left, right = merge - # Handle the special case of the space token, - # which is represented as "Ġ" in the HuggingFace tokenizer - left = left.replace("Ġ", " ") - right = right.replace("Ġ", " ") - return left.encode("utf-8") + right.encode("utf-8") - mergeable_ranks = { - merge_to_bytes(merge): rank + 256 - for rank, merge in enumerate(merges) - } - # mergeable_ranks = { - # left.encode("utf-8") + right.encode("utf-8"): rank + 256 - # for rank, (left, right) in enumerate(merges) - # } - mergeable_ranks.update({ bytes([i]): i for i in range(256) if i not in mergeable_ranks }) # Add single byte tokens to mergeable ranks + elif tp_trainer == "huggingface": + # Delegate HuggingFace training logic to tokenizer.hf module + mergeable_ranks = train_huggingface_from_iterator(text_iterator, config) # TODO: add other trainer options (bpe, rust bpe, fast bpe...) # The following options are placeholders for future impl. - elif config.trainer in ["bpe", "fbpe", "rbpe"]: - raise NotImplementedError(f"Tokenizer training mode {config.trainer!r} is not yet implemented. Please use 'huggingface' mode.") - elif config.trainer == "bpe": + elif tp_trainer in ["bpe", "fbpe", "rbpe"]: + raise NotImplementedError(f"Tokenizer training mode {tp_trainer!r} is not yet implemented. Please use 'huggingface' mode.") + elif tp_trainer == "bpe": # naive python implementation of byte-level BPE, not optimized for large corpora, but serves as a reference from gpt_lab.tokenizer.bpe import bpe _, mergeable_ranks = bpe() - elif config.trainer == "fbpe": + elif tp_trainer == "fbpe": from gpt_lab.tokenizer.bpe import bpe_fast trainer = ... 
- elif config.trainer == "rbpe": + elif tp_trainer == "rbpe": from rbpe import bpe ... - elif config.trainer == "dummy": + elif tp_trainer == "dummy": log0("Using DummyTokenizer for training, this is not a real tokenizer and should only be used for testing purposes.", level="warning", logger=logger) return cls(DummyTokenizer(config), config) else: - msg = f"Tokenizer trainer {config.trainer!r} is not supported." + msg = f"Tokenizer trainer {tp_trainer!r} is not supported." log_error(msg, error_type=NotImplementedError, logger=logger) tokenizer = cls( mergeable_ranks=mergeable_ranks, special_tokens=special_tokens, config=config ) - if config.to_save: + to_save_flag = tp.to_save if tp is not None else getattr(config, "to_save", True) + if to_save_flag: tokenizer.save_to_directory() return tokenizer diff --git a/src/gpt_lab/utils/schemas.py b/src/gpt_lab/utils/schemas.py index 4bbd93a..0eda4fb 100644 --- a/src/gpt_lab/utils/schemas.py +++ b/src/gpt_lab/utils/schemas.py @@ -160,10 +160,33 @@ def save_to_directory(self, directory: Optional[Union[str, Path]] = None): pickle.dump(self, f) +class TokenizerTrainingParams(BaseModel): + """Encapsulate training-related parameters for tokenizer training. + + Phase 6: move training-specific params into a dedicated model to cleanly + separate tokenizer metadata from training-run configuration. + """ + max_chars: int = -1 + chars_per_doc: int = -1 + merges_per_pass: int = 512 + num_proc: int = -1 + trainer: Literal["tiktoken", "huggingface", "bpe", "fbpe", "rbpe", "dummy"] = "huggingface" + show_progress: bool = True + to_save: bool = True + + def model_post_init(self, context: Any) -> None: + # leave defaults; some values will be adjusted by TokenizerTrainerConfig + pass + + class TokenizerTrainerConfig(TokenizerConfig): model_config = ConfigDict( json_encoders={Path: str}, ) + # Backwards-compatible placement of training params. New code should use + # `training_params` to access training-related options. 
+ training_params: TokenizerTrainingParams = Field(default_factory=TokenizerTrainingParams) + # Keep legacy fields for compatibility; they'll be synced into training_params max_chars: int = -1 chars_per_doc: int = -1 merges_per_pass: int = 512 # Only used for fbpe @@ -174,16 +197,25 @@ class TokenizerTrainerConfig(TokenizerConfig): def model_post_init(self, context: Any) -> None: super().model_post_init(context) - if self.trainer == "tiktoken" and self.pat_str == "": + # Sync legacy fields into the new `training_params` container + self.training_params.max_chars = self.max_chars + self.training_params.chars_per_doc = self.chars_per_doc + self.training_params.merges_per_pass = self.merges_per_pass + self.training_params.num_proc = self.num_proc + self.training_params.trainer = self.trainer + self.training_params.show_progress = self.show_progress + self.training_params.to_save = self.to_save + + if self.training_params.trainer == "tiktoken" and self.pat_str == "": log0("Using tiktoken trainer with an empty pat_str may lead to suboptimal tokenization. 
" "Consider using a regex pattern for better tokenization performance.", level="warning", logger=logger) - if self.max_chars == -1: - self.max_chars = int(self.vocab_size * 1000 * 2.5) # ~3.5 characters per token on average, adjust as needed based on your corpus - if self.chars_per_doc == -1: - self.chars_per_doc = self.max_chars // 1000 # Default to 1000 documents if not specified, adjust as needed - if self.num_proc <= 0: - self.num_proc = min(32, (os.cpu_count() or 1) - 1) # Use all available CPUs minus one for training, adjust as needed + if self.training_params.max_chars == -1: + self.training_params.max_chars = int(self.vocab_size * 1000 * 2.5) + if self.training_params.chars_per_doc == -1: + self.training_params.chars_per_doc = self.training_params.max_chars // 1000 + if self.training_params.num_proc <= 0: + self.training_params.num_proc = min(32, (os.cpu_count() or 1) - 1) def save_to_directory(self, directory: Optional[Union[str, Path]] = None): if directory is not None: diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 2dd60e2..d74744b 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,9 +1,20 @@ import pytest import random import string +from pathlib import Path -from gpt_lab.tokenizer import Tokenizer, TokenizerConfig +from gpt_lab.tokenizer.serialization import ( + load_mergeable_ranks, + save_mergeable_ranks, + validate_mergeable_ranks, + validate_no_special_token_overlap, +) +from gpt_lab.tokenizer.truncation import parse_truncated_name, truncated_from_pretrained from gpt_lab.utils.special_tokens import SpecialTokens +from gpt_lab.tokenizer import Tokenizer, TokenizerConfig +from gpt_lab.tokenizer import hf as tokenizer_hf +import gpt_lab.tokenizer.auto as tokenizer_auto + def make_dummy_dataset(size: int, max_seq_len: int): """Creates a dummy dataset of random token sequences for testing purposes. 
@@ -63,4 +74,289 @@ def test_train_tokenizer(dummy_small): decoded = tokenizer.decode(tokens) assert decoded == sample, "Decoded text does not match original sample after training simulation" import warnings - warnings.warn("This is dummy training test. TODO: Implement actual training logic and test it properly.") \ No newline at end of file + warnings.warn("This is dummy training test. TODO: Implement actual training logic and test it properly.") + +@pytest.mark.fast +def test_parse_truncated_name_valid_and_invalid(): + assert parse_truncated_name("gpt2_truncated_1024") == ("gpt2", 1024) + assert parse_truncated_name("foo/bar_truncated_32000") == ("foo/bar", 32000) + + assert parse_truncated_name("gpt2") is None + assert parse_truncated_name("gpt2_truncated_x") is None + assert parse_truncated_name("gpt2_truncated_") is None + + +@pytest.mark.fast +def test_save_load_mergeable_ranks_roundtrip_and_fingerprint_stable(tmp_path: Path): + path_a = tmp_path / "tok_a.msgpack" + path_b = tmp_path / "tok_b.msgpack" + + # Same logical mapping, different insertion order. 
+ ranks_a = {b"b": 1, b"a": 0, b"ab": 2} + ranks_b = {b"ab": 2, b"a": 0, b"b": 1} + + fp_a = save_mergeable_ranks(path_a, ranks_a) + fp_b = save_mergeable_ranks(path_b, ranks_b) + + assert fp_a == fp_b + + loaded = load_mergeable_ranks(path_a) + assert loaded == {b"a": 0, b"b": 1, b"ab": 2} + + +@pytest.mark.fast +def test_load_mergeable_ranks_rejects_invalid_payloads(tmp_path: Path): + path = tmp_path / "invalid.msgpack" + + # Missing mergeable_ranks + path.write_bytes(__import__("msgpack").packb({"version": 1}, use_bin_type=True)) + with pytest.raises(ValueError, match="Missing 'mergeable_ranks'"): + load_mergeable_ranks(path) + + # Version must be int + path.write_bytes( + __import__("msgpack").packb({"version": "1", "mergeable_ranks": {}}, use_bin_type=True) + ) + with pytest.raises(ValueError, match="version' must be an integer"): + load_mergeable_ranks(path) + + # Unsupported version + path.write_bytes( + __import__("msgpack").packb({"version": 2, "mergeable_ranks": {}}, use_bin_type=True) + ) + with pytest.raises(ValueError, match="Unsupported tokenizer file version"): + load_mergeable_ranks(path) + + +@pytest.mark.fast +def test_validate_mergeable_ranks_and_special_overlap_errors(): + with pytest.raises(AssertionError, match="cannot have empty"): + validate_mergeable_ranks({}) + + with pytest.raises(AssertionError, match="non-empty bytes"): + validate_mergeable_ranks({"a": 0}) # type: ignore[arg-type] + + with pytest.raises(AssertionError, match="start at 0"): + validate_mergeable_ranks({b"a": 1}) + + with pytest.raises(AssertionError, match="not contiguous"): + validate_mergeable_ranks({b"a": 0, b"b": 2}) + + with pytest.raises(AssertionError, match="overlap"): + validate_no_special_token_overlap({b"<|bos|>": 0}, {"<|bos|>": 0}) + + +@pytest.mark.fast +def test_truncated_from_pretrained_rejects_vocab_below_byte_tokens(monkeypatch): + # Avoid disk lookups and base-tokenizer loading for this branch test. 
+ monkeypatch.setattr( + "gpt_lab.tokenizer.tokenizer.Tokenizer.from_disk", + lambda *_args, **_kwargs: (_ for _ in ()).throw(FileNotFoundError("missing")), + ) + dummy_base = type("DummyBase", (), {"vocab_size": 300})() + monkeypatch.setattr( + "gpt_lab.tokenizer.tokenizer.Tokenizer.from_pretrained", + lambda *_args, **_kwargs: dummy_base, + ) + + with pytest.raises(ValueError, match="must retain all 256 byte-level tokens"): + truncated_from_pretrained( + base_name="gpt2", + new_vocab_size=len(SpecialTokens().list()) + 255, + source="tiktoken", + special_tokens=SpecialTokens(), + ) + + +@pytest.mark.fast +def test_truncated_from_pretrained_returns_base_when_target_not_smaller(monkeypatch): + class DummyBase: + def __init__(self): + self.vocab_size = 300 + self.mergeable_ranks = {bytes([i]): i for i in range(256)} + self.config = type("Cfg", (), {"source": "tiktoken", "pat_str": "x", "special_tokens": SpecialTokens()})() + + dummy_base = DummyBase() + + monkeypatch.setattr( + "gpt_lab.tokenizer.tokenizer.Tokenizer.from_disk", + lambda *_args, **_kwargs: (_ for _ in ()).throw(FileNotFoundError("missing")), + ) + monkeypatch.setattr( + "gpt_lab.tokenizer.tokenizer.Tokenizer.from_pretrained", + lambda *_args, **_kwargs: dummy_base, + ) + + out = truncated_from_pretrained( + base_name="gpt2", + new_vocab_size=dummy_base.vocab_size, + source="tiktoken", + special_tokens=SpecialTokens(), + ) + assert out is dummy_base + + +@pytest.mark.fast +def test_train_huggingface_from_iterator_requires_tokenizers(monkeypatch): + monkeypatch.setattr(tokenizer_hf, "HFTokenizer", None) + + cfg = type( + "DummyCfg", + (), + { + "vocab_size": 300, + "pat_str": "gpt2", + "show_progress": False, + "special_tokens": type("DummyST", (), {"list": lambda self: ["<|bos|>"]})(), + }, + )() + + with pytest.raises(RuntimeError, match="tokenizers library is required"): + tokenizer_hf.train_huggingface_from_iterator(["hello"], cfg) + + +@pytest.mark.fast +def 
test_hf_wrapper_encode_decode_and_special_tokens(): + class DummyAdded: + def __init__(self, content): + self.content = content + + class DummyEncodeOut: + def __init__(self, ids): + self.ids = ids + + class DummyMain: + def get_added_tokens_decoder(self): + return {0: DummyAdded(""), 1: DummyAdded("")} + + def encode(self, text, add_special_tokens=False): + _ = add_special_tokens + return DummyEncodeOut([len(text)]) + + def decode(self, ids, skip_special_tokens=False): + _ = skip_special_tokens + return f"decoded-{sum(ids)}" + + cfg = type("Cfg", (), {"name": "dummy", "source": "huggingface", "vocab_size": 10, "pat_str": None})() + wrapper = tokenizer_hf.HuggingFaceTokenizerWrapper(DummyMain(), cfg) + + assert wrapper.special_tokens == ["", ""] + assert wrapper.encode("abcd") == [4] + assert wrapper.decode([1, 2, 3]) == "decoded-6" + +@pytest.mark.fast +def test_compute_optimal_vocab_size_with_explicit_tokenizer_model(monkeypatch): + class DummyTokenizer: + vocab_size = 777 + + monkeypatch.setattr( + tokenizer_auto.Tokenizer, + "from_pretrained", + lambda name: DummyTokenizer(), + ) + + out = tokenizer_auto.compute_optimal_vocab_size( + depth=4, + aspect_ratio=16, + train_tokenizer=False, + tokenizer_model="gpt2", + special_tokens=SpecialTokens(), + ) + assert out == 777 + + +@pytest.mark.fast +def test_compute_optimal_vocab_size_raises_when_too_small(monkeypatch): + class DummyMetaModel: + n_params = 1 + + monkeypatch.setattr(tokenizer_auto, "build_meta_model", lambda _cfg: DummyMetaModel()) + + with pytest.raises(ValueError, match="<256"): + tokenizer_auto.compute_optimal_vocab_size( + depth=2, + aspect_ratio=8, + train_tokenizer=False, + tokenizer_model=None, + special_tokens=SpecialTokens(), + get_closest=lambda _x: ("tiny", 128), + ) + + +@pytest.mark.fast +def test_resolve_tokenizer_explicit_or_auto(monkeypatch): + monkeypatch.setattr( + tokenizer_auto, + "get_closest_tokenizer_size", + lambda _vocab_size: ("cl100k_base", 100000), + ) + + assert 
tokenizer_auto.resolve_tokenizer("gpt2", 32000, SpecialTokens()) == "gpt2" + assert tokenizer_auto.resolve_tokenizer(None, 32000, SpecialTokens()) == "cl100k_base" + assert tokenizer_auto.resolve_tokenizer("auto", 32000, SpecialTokens()) == "cl100k_base" + + +@pytest.mark.fast +def test_build_or_load_tokenizer_notrain_uses_pretrained(monkeypatch): + sentinel = object() + monkeypatch.setattr( + tokenizer_auto.Tokenizer, + "from_pretrained", + lambda _name: sentinel, + ) + + out = tokenizer_auto.build_or_load_tokenizer( + tname="gpt2", + vocab_size=32000, + train_tokenizer=False, + base_name="unused", + pat_str="gpt2", + special_tokens=SpecialTokens(), + data_dir="unused", + random_seed=42, + ) + assert out is sentinel + + +@pytest.mark.fast +def test_build_or_load_tokenizer_training_path(monkeypatch): + created_cfg = {} + + class FakeTrainerCfg: + def __init__(self, **kwargs): + created_cfg.update(kwargs) + + class FakeCorpus: + def iterator(self): + return iter(["abc", "def"]) + + sentinel = object() + + monkeypatch.setattr(tokenizer_auto, "TokenizerTrainerConfig", FakeTrainerCfg) + monkeypatch.setattr( + tokenizer_auto.TokenizerCorpus, + "from_sources", + lambda **_kwargs: FakeCorpus(), + ) + monkeypatch.setattr( + tokenizer_auto.Tokenizer, + "train_from_iterator", + lambda text_iterator, config: sentinel, + ) + + out = tokenizer_auto.build_or_load_tokenizer( + tname=None, + vocab_size=4096, + train_tokenizer=True, + base_name="my_tok", + pat_str="gpt2", + special_tokens=SpecialTokens(), + data_dir="/tmp/corpus", + random_seed=7, + dirname="/tmp/tokdir", + ) + + assert out is sentinel + assert created_cfg["name"] == "my_tok" + assert created_cfg["vocab_size"] == 4096 + assert created_cfg["dirname"] == "/tmp/tokdir" From 3dca1a3faaacb7ae79c5b6829e6331484a570a7d Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Mon, 18 May 2026 11:25:31 +0200 Subject: [PATCH 08/18] tokenizer: fix corpus import + hf log error + test --- src/gpt_lab/tokenizer/corpus.py | 83 
++++++++++++- src/gpt_lab/tokenizer/hf.py | 23 ++-- src/gpt_lab/utils/logging.py | 2 + tests/test_tokenizer.py | 213 +++++++++++++++++++++++++++++++- 4 files changed, 306 insertions(+), 15 deletions(-) diff --git a/src/gpt_lab/tokenizer/corpus.py b/src/gpt_lab/tokenizer/corpus.py index 3b56552..ed9594d 100644 --- a/src/gpt_lab/tokenizer/corpus.py +++ b/src/gpt_lab/tokenizer/corpus.py @@ -1,14 +1,18 @@ from pathlib import Path import random, pickle -from gpt_lab.utils.default import RANDOM_SEED, CACHE_DIR, DATA_DIR -from gpt_lab.data.loader import load_datasets +from gpt_lab.utils.default import RANDOM_SEED, DATA_DIR from gpt_lab.data.normalizers import clean_codeparrot_example from typing import Union, Dict, Callable, Optional, Iterable, Tuple # TODO: consider using compression.ztsd when python.version >= 3.14 (pi) -import zstd +# import zstd from tqdm import tqdm +try: + from datasets import load_dataset +except ImportError: + load_dataset = None + _fineweb_2_names_raw = ["rus_Cyrl", "cmn_Hani", "deu_Latn", "jpn_Jpan", "spa_Latn", "fra_Latn", "ita_Latn", "por_Latn", "pol_Latn", "nld_Latn", "ind_Latn", "vie_Latn", "fas_Arab", "arb_Arab", "tur_Latn", "tha_Thai", "ukr_Cyrl", "ell_Grek", "kor_Hang", "ces_Latn", "swe_Latn", "hun_Latn", "ron_Latn", "nob_Latn", "dan_Latn", "fin_Latn", "bul_Cyrl", "hin_Deva", "ben_Beng", "slk_Latn", "slk_Latn", "lit_Latn", "bos_Latn", "slv_Latn", "ekk_Latn", "cat_Latn", "tam_Taml", "hrv_Latn", "lvs_Latn", "zsm_Latn", "azj_Latn", "srp_Cyrl", "kat_Geor", "npi_Deva", "mar_Deva", "nno_Latn"] _fineweb_2_names = [] @@ -19,6 +23,79 @@ alph.append(script) _fineweb_2_names.append(lang) + +def load_datasets( + # { "path": str, "name": str (optional), "weight": float (optional), "hook": Callable (optional) } + sources: Iterable[Dict[str, Union[str, float, Callable]]], + data_dir: Union[str,Path] = DATA_DIR, + split: str = "train", + streaming: bool = True, + shuffle: bool = True, + random_seed: int = 42, + *args, **kwargs + ) -> Dict[str, Iterable]: 
+ ds = dict() + for src in sources: + path, name = src["path"], src.get("name", None) + ds_name = path if name is None else f"{path}:{name}" + ds_split = src.get("split", split) + ds_hook = src.get("hook", lambda x: x) + _ds = ds_hook( + load_dataset( + path, + name=name, + split=ds_split, + streaming=streaming, + cache_dir=data_dir, + *args, **kwargs + ) + ) + if "filter_fn" in src: + _ds = _ds.filter(src["filter_fn"]) + if shuffle and streaming: + _ds = _ds.shuffle(seed=random_seed) + ds[ds_name] = _ds + return ds + +def weighted_sample_generator(streams, prng): + """ + streams: list of (iterable, weight) + yields items from one of the streams according to weights. + Designed to keep reading from selected stream until exhausted (streams are long) + For streaming HF datasets these are effectively infinite for training; we just sample. + """ + # convert weights to cumulative thresholds + total = sum(w for _, w in streams) + cum = [] + acc = 0.0 + for _, w in streams: + acc += w / total + cum.append(acc) + + # create iterators + iterators = [iter(s) for s, _ in streams] + while True: + p = prng.random() + # pick which stream index + idx = 0 + while p > cum[idx]: + idx += 1 + try: + yield next(iterators[idx]) + except StopIteration: + # If a stream ends, remove it from selection. + # For HF streaming this is unlikely; but handle gracefully. 
+ iterators.pop(idx) + streams.pop(idx) + cum = [] + total = sum(w for _, w in streams) if streams else 0 + acc = 0.0 + for _, w in streams: + acc += w / total + cum.append(acc) + if not streams: + break + def display_stat_by_source(stat_by_source: Dict[str, Dict[str, int]]): from rich.console import Console from rich.markdown import Markdown diff --git a/src/gpt_lab/tokenizer/hf.py b/src/gpt_lab/tokenizer/hf.py index a867dc4..2b46a79 100644 --- a/src/gpt_lab/tokenizer/hf.py +++ b/src/gpt_lab/tokenizer/hf.py @@ -4,21 +4,26 @@ from pathlib import Path from typing import Iterable, Dict -from gpt_lab.utils.logging import log_all +from gpt_lab.utils.logging import log_all, log0, log_error from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig from gpt_lab.utils.special_tokens import SpecialTokens from gpt_lab.tokenizer.base import _BaseTokenizer +import logging + +logger = logging.getLogger(__name__) + try: from tokenizers import Tokenizer as HFTokenizer from tokenizers import decoders, pre_tokenizers, Regex from tokenizers.models import BPE from tokenizers.trainers import BpeTrainer -except Exception: +except Exception as e: + log0(f"Failed to import HuggingFace tokenizers library: {e}. " \ + "HuggingFace tokenizer functionality will be unavailable. " \ + "To use HuggingFace tokenizers, please install the 'tokenizers' library via pip.", + level="warning", logger=logger) HFTokenizer = None -import logging - -logger = logging.getLogger(__name__) class HuggingFaceTokenizerWrapper(_BaseTokenizer): """Light wrapper around HuggingFace `tokenizers` tokenizer. 
@@ -39,7 +44,7 @@ def special_tokens(self): @classmethod def from_pretrained(cls, hf_path: str): if HFTokenizer is None: - log_all("tokenizers library not available, cannot load HuggingFace tokenizer", level="error", logger=logger) + log_error("tokenizers library is required to load HuggingFace tokenizer", logger=logger, error_type=ImportError) tokenizer = HFTokenizer.from_pretrained(hf_path) config = TokenizerConfig( name=hf_path, @@ -53,7 +58,7 @@ def from_pretrained(cls, hf_path: str): @classmethod def from_directory(cls, tokenizer_dir: str): if HFTokenizer is None: - log_all("tokenizers library not available, cannot load HuggingFace tokenizer", level="error", logger=logger) + log_error("tokenizers library is required to load HuggingFace tokenizer", logger=logger, error_type=ImportError) tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") tokenizer = HFTokenizer.from_file(tokenizer_path) config = TokenizerConfig( @@ -86,8 +91,8 @@ def train_huggingface_from_iterator(text_iterator: Iterable[str], config: Tokeni Returns a dict mapping byte-strings to ranks (integers). 
""" if HFTokenizer is None: - log_all("tokenizers library is required for HuggingFace trainer", level="error", logger=logger) - + log_error("tokenizers library is required for HuggingFace trainer", logger=logger, error_type=ImportError) + tknzr = HFTokenizer( BPE( byte_fallback=True, diff --git a/src/gpt_lab/utils/logging.py b/src/gpt_lab/utils/logging.py index 1a92ff8..faacad2 100644 --- a/src/gpt_lab/utils/logging.py +++ b/src/gpt_lab/utils/logging.py @@ -115,6 +115,8 @@ def log_all(msg, level=logging.ERROR, logger=logger): if isinstance(level, str): level = log_levels.get(level.upper(), logging.ERROR) logger.log(level, _with_rank(msg), stacklevel=3) + if level >= logging.ERROR: + raise RuntimeError(msg) def log_dict(title, info, logger=logger, level=logging.INFO, only_rank0=True, structured=False): level = _get_level(level) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index d74744b..3cb1005 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -2,6 +2,7 @@ import random import string from pathlib import Path +import json from gpt_lab.tokenizer.serialization import ( load_mergeable_ranks, @@ -13,6 +14,7 @@ from gpt_lab.utils.special_tokens import SpecialTokens from gpt_lab.tokenizer import Tokenizer, TokenizerConfig from gpt_lab.tokenizer import hf as tokenizer_hf +from gpt_lab.tokenizer.corpus import TokenizerCorpus import gpt_lab.tokenizer.auto as tokenizer_auto @@ -196,8 +198,129 @@ def __init__(self): assert out is dummy_base +@pytest.mark.fast +def test_compute_optimal_vocab_size_with_explicit_tokenizer_model(monkeypatch): + class DummyTokenizer: + vocab_size = 777 + + monkeypatch.setattr( + "gpt_lab.tokenizer.auto.Tokenizer.from_pretrained", + lambda name: DummyTokenizer(), + ) + + import gpt_lab.tokenizer.auto as tokenizer_auto + out = tokenizer_auto.compute_optimal_vocab_size( + depth=4, + aspect_ratio=16, + train_tokenizer=False, + tokenizer_model="gpt2", + special_tokens=SpecialTokens(), + ) + assert out == 777 + + 
+@pytest.mark.fast +def test_compute_optimal_vocab_size_raises_when_too_small(monkeypatch): + class DummyMetaModel: + n_params = 1 + + import gpt_lab.model.checkpoint as mcheck + monkeypatch.setattr(mcheck, "build_meta_model", lambda _cfg: DummyMetaModel()) + import gpt_lab.tokenizer.auto as tokenizer_auto + + with pytest.raises(ValueError, match="<256"): + tokenizer_auto.compute_optimal_vocab_size( + depth=2, + aspect_ratio=8, + train_tokenizer=False, + tokenizer_model=None, + special_tokens=SpecialTokens(), + get_closest=lambda _x: ("tiny", 128), + ) + + +@pytest.mark.fast +def test_resolve_tokenizer_explicit_or_auto(monkeypatch): + import gpt_lab.tokenizer.auto as tokenizer_auto + monkeypatch.setattr( + tokenizer_auto, + "get_closest_tokenizer_size", + lambda _vocab_size: ("cl100k_base", 100000), + ) + + assert tokenizer_auto.resolve_tokenizer("gpt2", 32000, SpecialTokens()) == "gpt2" + assert tokenizer_auto.resolve_tokenizer(None, 32000, SpecialTokens()) == "cl100k_base" + assert tokenizer_auto.resolve_tokenizer("auto", 32000, SpecialTokens()) == "cl100k_base" + + +@pytest.mark.fast +def test_build_or_load_tokenizer_notrain_uses_pretrained(monkeypatch): + import gpt_lab.tokenizer.auto as tokenizer_auto + sentinel = object() + monkeypatch.setattr( + "gpt_lab.tokenizer.auto.Tokenizer.from_pretrained", + lambda _name: sentinel, + ) + + out = tokenizer_auto.build_or_load_tokenizer( + tname="gpt2", + vocab_size=32000, + train_tokenizer=False, + base_name="unused", + pat_str="gpt2", + special_tokens=SpecialTokens(), + data_dir="unused", + random_seed=42, + ) + assert out is sentinel + + +@pytest.mark.fast +def test_build_or_load_tokenizer_training_path(monkeypatch): + import gpt_lab.tokenizer.auto as tokenizer_auto + created_cfg = {} + + class FakeTrainerCfg: + def __init__(self, **kwargs): + created_cfg.update(kwargs) + + class FakeCorpus: + def iterator(self): + return iter(["abc", "def"]) + + sentinel = object() + + monkeypatch.setattr(tokenizer_auto, 
"TokenizerTrainerConfig", FakeTrainerCfg) + monkeypatch.setattr( + "gpt_lab.tokenizer.auto.TokenizerCorpus.from_sources", + lambda **_kwargs: FakeCorpus(), + ) + monkeypatch.setattr( + "gpt_lab.tokenizer.auto.Tokenizer.train_from_iterator", + lambda text_iterator, config: sentinel, + ) + + out = tokenizer_auto.build_or_load_tokenizer( + tname=None, + vocab_size=4096, + train_tokenizer=True, + base_name="my_tok", + pat_str="gpt2", + special_tokens=SpecialTokens(), + data_dir="/tmp/corpus", + random_seed=7, + dirname="/tmp/tokdir", + ) + + assert out is sentinel + assert created_cfg["name"] == "my_tok" + assert created_cfg["vocab_size"] == 4096 + assert created_cfg["dirname"] == "/tmp/tokdir" + + @pytest.mark.fast def test_train_huggingface_from_iterator_requires_tokenizers(monkeypatch): + import gpt_lab.tokenizer.hf as tokenizer_hf monkeypatch.setattr(tokenizer_hf, "HFTokenizer", None) cfg = type( @@ -211,12 +334,14 @@ def test_train_huggingface_from_iterator_requires_tokenizers(monkeypatch): }, )() - with pytest.raises(RuntimeError, match="tokenizers library is required"): + with pytest.raises(Exception): tokenizer_hf.train_huggingface_from_iterator(["hello"], cfg) @pytest.mark.fast def test_hf_wrapper_encode_decode_and_special_tokens(): + import gpt_lab.tokenizer.hf as tokenizer_hf + class DummyAdded: def __init__(self, content): self.content = content @@ -243,6 +368,87 @@ def decode(self, ids, skip_special_tokens=False): assert wrapper.special_tokens == ["", ""] assert wrapper.encode("abcd") == [4] assert wrapper.decode([1, 2, 3]) == "decoded-6" + assert wrapper.decode([1, 2, 3]) == "decoded-6" + + +@pytest.mark.fast +def test_train_huggingface_from_iterator_with_mock_tokenizers(monkeypatch): + import gpt_lab.tokenizer.hf as tokenizer_hf + + # Create dummy components that mimic the API used in hf.train_huggingface_from_iterator + class DummyTokenizer: + def __init__(self, *args, **kwargs): + self._merges = [["a", "b"], ["Ġx", "y"]] + self.normalizer = None + 
self.pre_tokenizer = None + self.decoder = None + self.post_processor = None + + def train_from_iterator(self, iterator, trainer=None): + # no-op: we already have _merges + return None + + def to_str(self): + return json.dumps({"model": {"merges": self._merges}}) + + def get_vocab_size(self): + return 123 + + def save(self, path): + Path(path).write_text(self.to_str()) + + class DummyBPE: + def __init__(self, *args, **kwargs): + pass + + class DummyRegex: + def __init__(self, pat): + self.pat = pat + + class DummyPreTokenizers: + class Split: + def __init__(self, pattern=None, behavior=None, invert=None): + self.pattern = pattern + + class ByteLevel: + @staticmethod + def alphabet(): + return [0, 1, 2] + + def __init__(self, add_prefix_space=False, use_regex=False): + pass + + @staticmethod + def Sequence(items): + return items + + class DummyDecoders: + class ByteLevel: + def __init__(self): + pass + + class DummyBpeTrainer: + def __init__(self, *args, **kwargs): + self.show_progress = kwargs.get("show_progress", False) + + # Monkeypatch the tokenizer_hf module-level names to our dummies + monkeypatch.setattr(tokenizer_hf, "HFTokenizer", DummyTokenizer) + monkeypatch.setattr(tokenizer_hf, "BPE", DummyBPE) + monkeypatch.setattr(tokenizer_hf, "Regex", DummyRegex) + monkeypatch.setattr(tokenizer_hf, "pre_tokenizers", DummyPreTokenizers) + monkeypatch.setattr(tokenizer_hf, "decoders", DummyDecoders) + monkeypatch.setattr(tokenizer_hf, "BpeTrainer", DummyBpeTrainer) + + # Minimal trainer config + cfg = type("DummyCfg", (), {"vocab_size": 300, "pat_str": "\\w+", "show_progress": False, "special_tokens": SpecialTokens()})() + + out = tokenizer_hf.train_huggingface_from_iterator(["hello world"], cfg) + + # Expect mergeable ranks to be a dict mapping bytes to ints and include single-byte entries + assert isinstance(out, dict) + # Check that merged pair "a","b" became bytes key + assert any(isinstance(k, (bytes, bytearray)) for k in out.keys()) + assert out.get(bytes([0])) 
== 0 @pytest.mark.fast def test_compute_optimal_vocab_size_with_explicit_tokenizer_model(monkeypatch): @@ -270,7 +476,8 @@ def test_compute_optimal_vocab_size_raises_when_too_small(monkeypatch): class DummyMetaModel: n_params = 1 - monkeypatch.setattr(tokenizer_auto, "build_meta_model", lambda _cfg: DummyMetaModel()) + import gpt_lab.model.checkpoint as mcheck + monkeypatch.setattr(mcheck, "build_meta_model", lambda _cfg: DummyMetaModel()) with pytest.raises(ValueError, match="<256"): tokenizer_auto.compute_optimal_vocab_size( @@ -334,7 +541,7 @@ def iterator(self): monkeypatch.setattr(tokenizer_auto, "TokenizerTrainerConfig", FakeTrainerCfg) monkeypatch.setattr( - tokenizer_auto.TokenizerCorpus, + TokenizerCorpus, "from_sources", lambda **_kwargs: FakeCorpus(), ) From bde29121e77d42315553f70ada67274548347dd0 Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Tue, 19 May 2026 14:46:50 +0200 Subject: [PATCH 09/18] tokenizer corpus: introduce byte control over char control --- scripts/scaling_tokenizer.py | 104 +++++++--- src/gpt_lab/tokenizer/base.py | 29 +-- src/gpt_lab/tokenizer/corpus.py | 290 ++++++++++++++++------------ src/gpt_lab/tokenizer/tokenizer.py | 18 +- src/gpt_lab/tokenizer/truncation.py | 2 + src/gpt_lab/utils/schemas.py | 127 ++++-------- tests/test_tokenizer.py | 14 +- 7 files changed, 312 insertions(+), 272 deletions(-) diff --git a/scripts/scaling_tokenizer.py b/scripts/scaling_tokenizer.py index 82ac26b..682b704 100644 --- a/scripts/scaling_tokenizer.py +++ b/scripts/scaling_tokenizer.py @@ -11,6 +11,8 @@ import regex as re +BASELINES = ["gpt2", "cl100k_base", "o200k_base"] + def load_all_results(path): results = [] with open(path, "rb") as f: @@ -94,11 +96,12 @@ def get_eval_corpus(eval_set): return TokenizerCorpus.from_sources( corpus_dir=eval_set["localdir"], sources=[eval_set["generator_source"]], - max_chars=500_000, # just for evaluation, we can use a subset of the data - chars_per_doc=10_000, + # max_chars=500_000, # just for evaluation, 
we can use a subset of the data + max_bytes=500_000 * 4, # just for evaluation, we can use a subset of the data, adjust as needed based on your corpus + bytes_per_doc=4096 * 4, split=eval_set["split"], compressed=True, - shard_size_chars=50_000, + shard_size_bytes=50_000 * 4, loader_fn=eval_set.get("loader_fn", None), # will overwrite for enwik8 ) @@ -144,14 +147,33 @@ def eval_tokenizer(tokenizer): results[eval_set["metricname"]] = res return results +def compare_with_truncated_baselines(target_vocab_size): + comparisons = {} + from tiktoken import get_encoding + + for baseline in BASELINES: + baseline_vocab_size = get_encoding(baseline).n_vocab + if baseline_vocab_size <= target_vocab_size: + continue + + truncated_name = f"{baseline}_truncated_{target_vocab_size}" + truncated_tokenizer = Tokenizer.from_pretrained(truncated_name) + comparisons[baseline] = { + "base_vocab_size": baseline_vocab_size, + "truncated_name": truncated_name, + "evaluation": eval_tokenizer(truncated_tokenizer), + } + + return comparisons + def run_tokenizer_experiment(task): ( vocab_size, p_str_name, p_str, - max_char, + max_bytes, corpus_path, - corpus_charmax, + corpus_bytemax, seed, name, ) = task @@ -159,13 +181,13 @@ def run_tokenizer_experiment(task): corpus = TokenizerCorpus.from_sources( corpus_dir=corpus_path, sources=None, - max_chars=corpus_charmax, - chars_per_doc=corpus_charmax // 10_000, + max_bytes=corpus_bytemax, + bytes_per_doc=corpus_bytemax // 10_000, random_seed=seed, ) config = TokenizerTrainerConfig( - max_chars=max_char, - chars_per_doc=max_char // 1000, + max_bytes=max_bytes, + bytes_per_doc=max_bytes // 10_000, vocab_size=vocab_size, name=name, num_proc=num_procs, @@ -177,7 +199,7 @@ def run_tokenizer_experiment(task): ) t0 = time.time() tokenizer = Tokenizer.train_from_iterator( - text_iterator=corpus.iterator(max_chars=max_char), + text_iterator=corpus.iterator(max_bytes=max_bytes), config=config, ) t1 = time.time() @@ -185,13 +207,14 @@ def 
run_tokenizer_experiment(task): result = { "vocab_size": vocab_size, "pattern": p_str_name, - "max_chars": max_char, + "max_bytes": max_bytes, + "tokenizer_name": name, "config": str(config), "training_time": t1 - t0, "corpus_size_mb": corpus_path.stat().st_size / 1e6, } - for text in corpus.iterator(max_chars=max_char): + for text in corpus.iterator(max_bytes=max_bytes): result["nb_chars_trained"] = result.get("nb_chars_trained", 0) + len(text) result["nb_words_trained"] = result.get("nb_words_trained", 0) + len(text.split()) result["nb_bytes_trained"] = result.get("nb_bytes_trained", 0) + len(text.encode("utf-8")) @@ -203,13 +226,15 @@ def run_tokenizer_experiment(task): del tokenizer return result -char_per_doc = lambda max_char: max_char // 10_000 # Default to 1000 documents if not specified, adjust as needed +byte_per_doc = lambda max_byte: max_byte // 10_000 # Default to 1000 documents if not specified, adjust as needed def main(): parser = argparse.ArgumentParser(description="Find the optimal corpus size for training a BPE tokenizer with different vocabulary sizes, and evaluate the trained tokenizers on a simple test set to analyze the trade-offs between corpus size, vocabulary size, training time, and tokenization quality.") parser.add_argument("--write-corpus", action="store_true", help="Flag to indicate training mode (write corpus). 
If not set, the script will attempt to load an existing corpus from disk.") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--results-path", type=str, default=str(TOKENIZERS_FOLDER / 'scaling_tokenizer_results.pkl'), help="Path to store the results of the tokenizer evaluations.") + parser.add_argument("--compare-truncated-baselines", action="store_true", help="Whether to compare trained tokenizers with truncated versions of baseline tokenizers.") + parser.add_argument("--corpus-temperature-alpha", type=float, default=None, help="Optional temperature parameter to control the randomness of the corpus generation. Higher values will result in a more diverse corpus, while lower values will make it more focused on the most common samples. This can be useful for testing how the tokenizer performs with different levels of corpus diversity.") args = parser.parse_args() import os num_procs = min(os.cpu_count(), 32) @@ -260,11 +285,10 @@ def store_results(results_batch, path=results_path): # Baselines: gpt2, cl100k_base, o200k_base from tiktoken import get_encoding - baselines = ["gpt2", "cl100k_base", "o200k_base"] results = load_all_results(results_path) if results_path.exists() else [] if len(results) == 0: results = [] - for baseline in baselines: + for baseline in BASELINES: enc = get_encoding(baseline) evaluation = eval_tokenizer(enc) result = dict( @@ -285,31 +309,32 @@ def store_results(results_batch, path=results_path): patterns = { "pat_str-gpt2": PAT_STR_GPT2, "pat_str-gpt4": PAT_STR_GPT4, "pat_str-punct": PAT_STR_punct, "pat_str-cl100k_base": PAT_STR_cl100k_base, "pat_str-o200k_base": PAT_STR_o200k_base } # patterns = { "PAT_STR_o200k_base": PAT_STR_o200k_base } # TODO: optimize by running the biggest vocab size and slice it on top-k merges for smaller vocabs - vocab_sizes = [10_000, 20_000, 30_000, 50_000, 100_000, 200_000, 300_000, 500_000] + # vocab_sizes = [10_000, 20_000, 30_000, 50_000, 100_000, 200_000, 300_000, 500_000] + 
vocab_sizes = [30_000, 50_000, 70_000, 100_000, 200_000] # vocab_sizes = list(reversed(vocab_sizes)) - _max_char_runs = 8 - max_chars = lambda vocab_size: [int(vocab_size * i * 500) for i in range(1, _max_char_runs+1)] # ~3.5 characters per token on average, adjust as needed based on your corpus + _max_char_runs = 8 # adjust the divisor to control how many runs are done before storing results to disk, this is a trade-off between memory usage and frequency of saving intermediate results. With 3 processes, we can afford to do more runs before saving, but if you have more memory constraints, you might want to save more frequently by using a smaller divisor. + max_bytes = lambda vocab_size: [int(vocab_size * i * 512) for i in range(1, _max_char_runs+1)] # ~3.5 characters per token on average, adjust as needed based on your corpus # Two options: same name for all tokenizers -> overwrite / different names -> many tokenizers on disk, consider cleaning up after training or implementing a caching mechanism to avoid retraining the same tokenizer multiple times. # name = lambda vocab_size, max_char, p_str_name: f"ic1-tok-{int(vocab_size//1000)}k_maxchar-{max_char//1e6:.1f}M_pattern-{p_str_name}" - name = "ic1-scaling-tok" print(f"Using {num_procs} processes for tokenizer training.") corpus_path = DATA_DIR / "corpus" / results_path.stem results = [] - corpus_charmax = max(max_chars(max(vocab_sizes))) + corpus_bytemax = max(max_bytes(max(vocab_sizes))) if args.write_corpus: - print(f"Writing corpus to {corpus_path} with max chars {corpus_charmax:,}...") + print(f"Writing corpus to {corpus_path} with max bytes {corpus_bytemax:,}...") corpus = TokenizerCorpus.write_from_sources( corpus_dir=corpus_path, - max_chars=corpus_charmax, - chars_per_doc=char_per_doc(corpus_charmax), + max_bytes=corpus_bytemax, + bytes_per_doc=byte_per_doc(corpus_bytemax), random_seed=args.seed, + temperature_alpha=args.corpus_temperature_alpha, ) print(f"Corpus written to {corpus_path}. 
Size: {sum(c.stat().st_size / 1e6 for c in corpus_path.glob('*.txt')):.2f} MB") # Prepare run configurations if not args.write_corpus: - print(f"Using existing corpus at {corpus_path} with max chars {corpus_charmax:,} for tokenizer training.") + print(f"Using existing corpus at {corpus_path} with max bytes {corpus_bytemax:,} for tokenizer training.") if not corpus_path.exists(): raise FileNotFoundError(f"Corpus path {corpus_path} does not exist. Please run the script with --write-corpus flag to create the corpus before training tokenizers.") corpus = TokenizerCorpus.from_sources(corpus_dir=corpus_path) @@ -318,17 +343,17 @@ def store_results(results_batch, path=results_path): for vocab_size in vocab_sizes: for p_str_name, p_str in patterns.items(): - for max_char in max_chars(vocab_size): + for max_byte in max_bytes(vocab_size): tasks.append( ( vocab_size, p_str_name, p_str, - max_char, + max_byte, corpus_path, - corpus_charmax, + corpus_bytemax, args.seed, - name, + f"ic1-scaling-tok-{p_str_name}-v{vocab_size}-b{max_byte//1e6:.1f}M", ) ) @@ -353,15 +378,34 @@ def store_results(results_batch, path=results_path): buffer = [] for i, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Tokenizer experiments")): - buffer.append(future.result()) + result = future.result() + buffer.append(result) + results.append(result) - if i % _max_char_runs == 0: + if len(buffer) >= _max_char_runs: store_results(buffer) buffer.clear() if buffer: store_results(buffer) + if args.compare_truncated_baselines: + comparison_records = [] + for entry in results: + if entry.get("baseline") is not None: + continue + target_vocab_size = entry["vocab_size"] + comparison_records.append({ + "comparison_for": entry.get("tokenizer_name"), + "vocab_size": target_vocab_size, + "pattern": entry.get("pattern"), + "max_chars": entry.get("max_chars"), + "max_bytes": entry.get("max_bytes"), + "truncated_baseline_evaluations": compare_with_truncated_baselines(target_vocab_size), + }) + 
if comparison_records: + store_results(comparison_records) + print(f"Total time for all runs: {(time.time() - t_total_start)/3600:.2f} hours.") print(f"All runs completed. Results stored in {results_path}.") diff --git a/src/gpt_lab/tokenizer/base.py b/src/gpt_lab/tokenizer/base.py index 09e35bc..701b07c 100644 --- a/src/gpt_lab/tokenizer/base.py +++ b/src/gpt_lab/tokenizer/base.py @@ -23,20 +23,21 @@ def __init__(self, config: Optional[TokenizerConfig] = None): self.config = config self.special_tokens = None self.mergeable_ranks = None - try: - self.token_bytes = self.get_token_bytes() - except Exception as e: - log0(f"Failed to get token bytes during initialization: {e}. " \ - f"This may cause issues with optimizers that rely on token byte lengths. "\ - "You can try calling get_token_bytes() manually after initialization to see the full error message and debug the issue.", - level="warning", logger=logger) + # disable token_bytes inits for scaling tok + # try: + # self.token_bytes = self.get_token_bytes() + # except Exception as e: + # log0(f"Failed to get token bytes during initialization: {e}. " \ + # f"This may cause issues with optimizers that rely on token byte lengths. 
"\ + # "You can try calling get_token_bytes() manually after initialization to see the full error message and debug the issue.", + # level="warning", logger=logger) def get_vocab(self): return {**self.mergeable_ranks, **self.special_tokens} @property def vocab_size(self): - "vocab_size value icludes both mergeable ranks and special tokens" + "vocab_size value includes both mergeable ranks and special tokens" return len(self.mergeable_ranks) + len(self.special_tokens) @property @@ -46,11 +47,17 @@ def n_special_tokens(self): @property def n_ranks(self): return len(self.mergeable_ranks) + + @property + def token_bytes(self): + if getattr(self, "_token_bytes", None) is None: + self._token_bytes = self.get_token_bytes() + return self._token_bytes def get_token_bytes(self): token_bytes_path = Path(self.config.dirname) / "token_bytes.pt" - if getattr(self, "token_bytes", None) is not None: - return self.token_bytes + if getattr(self, "_token_bytes", None) is not None: + return self._token_bytes if token_bytes_path.exists(): token_bytes = torch.load(token_bytes_path) @@ -68,7 +75,7 @@ def get_token_bytes(self): torch.save(token_bytes, f) log0(f"Saved token_bytes to {token_bytes_path}", logger=logger) - self.token_bytes = token_bytes + self._token_bytes = token_bytes return token_bytes def __call__(self, text, *args, **kwds): diff --git a/src/gpt_lab/tokenizer/corpus.py b/src/gpt_lab/tokenizer/corpus.py index ed9594d..d1cf66f 100644 --- a/src/gpt_lab/tokenizer/corpus.py +++ b/src/gpt_lab/tokenizer/corpus.py @@ -2,10 +2,10 @@ import random, pickle from gpt_lab.utils.default import RANDOM_SEED, DATA_DIR from gpt_lab.data.normalizers import clean_codeparrot_example +from gpt_lab.utils.logging import log0 from typing import Union, Dict, Callable, Optional, Iterable, Tuple # TODO: consider using compression.ztsd when python.version >= 3.14 (pi) -# import zstd from tqdm import tqdm try: @@ -13,6 +13,9 @@ except ImportError: load_dataset = None +import logging + +logger = 
logging.getLogger(__name__) _fineweb_2_names_raw = ["rus_Cyrl", "cmn_Hani", "deu_Latn", "jpn_Jpan", "spa_Latn", "fra_Latn", "ita_Latn", "por_Latn", "pol_Latn", "nld_Latn", "ind_Latn", "vie_Latn", "fas_Arab", "arb_Arab", "tur_Latn", "tha_Thai", "ukr_Cyrl", "ell_Grek", "kor_Hang", "ces_Latn", "swe_Latn", "hun_Latn", "ron_Latn", "nob_Latn", "dan_Latn", "fin_Latn", "bul_Cyrl", "hin_Deva", "ben_Beng", "slk_Latn", "slk_Latn", "lit_Latn", "bos_Latn", "slv_Latn", "ekk_Latn", "cat_Latn", "tam_Taml", "hrv_Latn", "lvs_Latn", "zsm_Latn", "azj_Latn", "srp_Cyrl", "kat_Geor", "npi_Deva", "mar_Deva", "nno_Latn"] _fineweb_2_names = [] @@ -23,6 +26,53 @@ alph.append(script) _fineweb_2_names.append(lang) +def apply_temperature_sampling( + sources, + alpha: float = 0.5, + min_weight: float = 0.0, +): + """ + Temperature sampling over dataset weights. + + p_i ∝ w_i^alpha + + alpha < 1: + flattens distribution + boosts smaller datasets + + alpha = 1: + original distribution + + alpha = 0: + uniform sampling + """ + raw = [max(src.get("weight", 1.0), min_weight) for src in sources] + scaled = [w ** alpha for w in raw] + total = sum(scaled) + + for src, w in zip(sources, scaled): + src["weight"] = w / total + + return sources + +def safe_byte_truncate(text: str, max_bytes: int) -> str: + encoded = text.encode("utf-8") + if len(encoded) <= max_bytes: + return text + truncated = encoded[:max_bytes] + # walk backward until valid UTF-8 + while truncated: + try: + return truncated.decode("utf-8") + except UnicodeDecodeError: + truncated = truncated[:-1] + return "" + +def _normalize_sources_weights(sources, weights_sum): + """Attach a stable `name` key to each source dict. 
Call once before any loop.""" + for src in sources: + src["weight"] = src.get("weight", 1.0) / weights_sum + return sources def load_datasets( # { "path": str, "name": str (optional), "weight": float (optional), "hook": Callable (optional) } @@ -32,9 +82,10 @@ def load_datasets( streaming: bool = True, shuffle: bool = True, random_seed: int = 42, - *args, **kwargs + loader_kwargs: dict = None ) -> Dict[str, Iterable]: ds = dict() + loader_kwargs = loader_kwargs or {} for src in sources: path, name = src["path"], src.get("name", None) ds_name = path if name is None else f"{path}:{name}" @@ -47,7 +98,7 @@ def load_datasets( split=ds_split, streaming=streaming, cache_dir=data_dir, - *args, **kwargs + **loader_kwargs ) ) if "filter_fn" in src: @@ -57,45 +108,6 @@ def load_datasets( ds[ds_name] = _ds return ds -def weighted_sample_generator(streams, prng): - """ - streams: list of (iterable, weight) - yields items from one of the streams according to weights. - Designed to keep reading from selected stream until exhausted (streams are long) - For streaming HF datasets these are effectively infinite for training; we just sample. - """ - # convert weights to cumulative thresholds - total = sum(w for _, w in streams) - cum = [] - acc = 0.0 - for _, w in streams: - acc += w / total - cum.append(acc) - - # create iterators - iterators = [iter(s) for s, _ in streams] - while True: - p = prng.random() - # pick which stream index - idx = 0 - while p > cum[idx]: - idx += 1 - try: - yield next(iterators[idx]) - except StopIteration: - # If a stream ends, remove it from selection. - # For HF streaming this is unlikely; but handle gracefully. 
- iterators.pop(idx) - streams.pop(idx) - cum = [] - total = sum(w for _, w in streams) if streams else 0 - acc = 0.0 - for _, w in streams: - acc += w / total - cum.append(acc) - if not streams: - break - def display_stat_by_source(stat_by_source: Dict[str, Dict[str, int]]): from rich.console import Console from rich.markdown import Markdown @@ -120,10 +132,11 @@ def display_stat_by_source(stat_by_source: Dict[str, Dict[str, int]]): class TokenizerCorpus: def __init__( self, - total_chars: int, + total_bytes: int, total_docs: int, corpus_dir: Union[str, Path], random_seed: int = RANDOM_SEED, + total_chars: Optional[int] = None, sources: Optional[dict] = None, compressed: bool = False, stat_by_source: Optional[Dict[str, Dict[str, int]]] = None, @@ -131,6 +144,7 @@ def __init__( corpus_dir = TokenizerCorpus.init_corpusdir(corpus_dir) self.corpus_dir = corpus_dir self.random_seed = random_seed + self.total_bytes = total_bytes self.total_chars = total_chars self.total_docs = total_docs if sources is not None: @@ -152,14 +166,14 @@ def save(self): with open(self.meta_path, "wb") as f: pickle.dump(self, f) - def iterator(self, max_chars: Optional[int] = None) -> Iterable[str]: - char_count = 0 + def iterator(self, max_bytes: Optional[int] = None) -> Iterable[str]: + byte_count = 0 for shard in self.shard_paths(): with self.open_text_file(shard) as f: for line in f: yield line.strip() - char_count += len(line) - if max_chars and char_count >= max_chars: + byte_count += len(line) + if max_bytes and byte_count >= max_bytes: return @staticmethod @@ -181,31 +195,37 @@ def write_from_sources( cls, corpus_dir: Union[str, Path], sources: Optional[dict] = None, # dict ds_name: weight, - chars_per_doc: int = 10_000, - max_chars: int = 1_000_000_000, + bytes_per_doc: int = 10_000, + max_bytes: int = 1_000_000_000, random_seed: int = RANDOM_SEED, split: str = "train", + temperature_alpha: Optional[float] = None, compressed: bool = False, # False: .txt, True: sharded .txt.zst 
(optimized memory for large corpora) - shard_size_chars: Optional[int] = None, # only relevant if compressed=True; if None, defaults to max_chars (i.e. single shard) + shard_size_bytes: Optional[int] = None, # only relevant if compressed=True; if None, defaults to max_bytes (i.e. single shard) ): - if shard_size_chars is None: - shard_size_chars = max_chars // 10 if compressed else max_chars # heuristic for shard size; adjust as needed + if shard_size_bytes is None: + shard_size_bytes = max_bytes // 10 if compressed else max_bytes # heuristic for shard size; adjust as needed # TODO: fix zstd comp + if compressed: + log0("Warning: compressed corpus writing is not yet implemented; writing uncompressed .txt files instead", logger=logger) compressed = False + corpus_dir = TokenizerCorpus.init_corpusdir(corpus_dir) - char_count, doc_count, stat_by_source = write_corpus_sample( + bytes_count, char_count, doc_count, stat_by_source = write_corpus_sample( sources=sources, - chars_per_doc=chars_per_doc, - max_chars=max_chars, + bytes_per_doc=bytes_per_doc, + max_bytes=max_bytes, corpus_dir=corpus_dir, random_seed=random_seed, + temperature_alpha=temperature_alpha, split=split, - shard_size_chars=shard_size_chars, + shard_size_bytes=shard_size_bytes, compressed=compressed, ) display_stat_by_source(stat_by_source) meta = cls( corpus_dir=corpus_dir, + total_bytes=bytes_count, total_chars=char_count, total_docs=doc_count, compressed=compressed, @@ -217,6 +237,7 @@ def write_from_sources( def show_stats(self): print(f"Corpus directory: {self.corpus_dir}") + print(f"Total bytes: {self.total_bytes:,}") print(f"Total chars: {self.total_chars:,}") print(f"Total docs: {self.total_docs:,}") if self.stat_by_source: @@ -227,34 +248,34 @@ def from_sources( cls, corpus_dir: Union[str, Path], sources: Optional[dict] = None, # dict ds_name: weight, - chars_per_doc: int = 10_000, - max_chars: int = 1_000_000_000, + bytes_per_doc: int = 10_000, + max_bytes: int = 1_000_000_000, random_seed: 
int = RANDOM_SEED, split: str = "train", compressed: bool = False, - shard_size_chars: Optional[int] = None, + shard_size_bytes: Optional[int] = None, loader_fn: Optional[Callable] = None, # if provided, should be function that takes dataset config and returns iterator of text samples; overrides default loading from datasets library ): meta = None if loader_fn is not None: class CustomLoaderCorpus(TokenizerCorpus): # overwrite iterator to use custom loader - def iterator(self, max_chars: Optional[int] = None) -> Iterable[str]: + def iterator(self, max_bytes: Optional[int] = None) -> Iterable[str]: return loader_fn() - meta = CustomLoaderCorpus(corpus_dir=corpus_dir, total_chars=-1, total_docs=-1) + meta = CustomLoaderCorpus(corpus_dir=corpus_dir, total_bytes=-1, total_docs=-1) else: try: meta = cls.from_path(corpus_dir) - except: + except (FileNotFoundError, pickle.UnpicklingError, EOFError): meta = cls.write_from_sources( corpus_dir=corpus_dir, sources=sources, - chars_per_doc=chars_per_doc, - max_chars=max_chars, + bytes_per_doc=bytes_per_doc, + max_bytes=max_bytes, compressed=compressed, split=split, random_seed=random_seed, - shard_size_chars=shard_size_chars, + shard_size_bytes=shard_size_bytes, ) assert meta is not None, "Failed to create or load corpus metadata" return meta @@ -276,25 +297,27 @@ def shard_paths(self): def init_corpusdir(corpus_dir: Union[str, Path]): if isinstance(corpus_dir, str): corpus_dir = Path(corpus_dir) - if not corpus_dir.suffix == None: - corpus_dir = corpus_dir.with_suffix("") - # if not corpus_dir.suffix == ".txt": - # corpus_dir = corpus_dir.with_suffix(".txt") - # if compressed: - # corpus_dir = corpus_dir.with_suffix(".txt.zst") - if not corpus_dir.exists(): corpus_dir.mkdir(parents=True, exist_ok=True) return corpus_dir + +def weighted_sample_generator(sources, iters, prng, batch_size=10_000): + names = [src["name"] for src in sources] + weights = [src["weight"] for src in sources] + while True: + batch = prng.choices(names, 
weights=weights, k=batch_size) + for name in batch: + yield next(iters[name]), name def write_corpus_sample( sources = None, # dict ds_name: weight - chars_per_doc: int = 10_000, - max_chars: int = 1_000_000_000, - shard_size_chars: int = 1_000_000_000, + bytes_per_doc: int = 10_000, + max_bytes: int = 1_000_000_000, + shard_size_bytes: int = 1_000_000_000, per_dataset_normalizer: Optional[Callable] = None, corpus_dir: Path = DATA_DIR / "tokenizer_corpus", + temperature_alpha: Optional[float] = None, split: str = "train", show_progress: bool = True, random_seed: int = RANDOM_SEED, @@ -302,94 +325,105 @@ def write_corpus_sample( streaming: bool = True, shuffle: bool = True, ): - - if not sources: + if sources is None: sources = [ # base corpus for tokenizer training; mostly web text with some code and math - { "path": "HuggingFaceFW/fineweb-edu", "weight": 0.45 }, - { "path": "HuggingFaceTB/finemath", "weight": 0.2, "name": "finemath-4plus" }, - { "path": "codeparrot/codeparrot-clean", "weight": 0.2 }, + { "path": "HuggingFaceFW/fineweb-edu", "weight": 0.30 }, + { "path": "HuggingFaceTB/finemath", "weight": 0.15, "name": "finemath-4plus" }, + { "path": "codeparrot/codeparrot-clean", "weight": 0.15 }, ] + multilingual_weight = 0.4 + per_lang = multilingual_weight / len(_fineweb_2_names) for name in _fineweb_2_names: - sources.append({ "path": "HuggingFaceFW/fineweb-2", "weight": 0.15 / len(_fineweb_2_names), "name": name }) + sources.append({ + "path": "HuggingFaceFW/fineweb-2", + "weight": per_lang, + "name": name + }) # ronantakizawa/github-top-code with file_language="Python" ds = load_datasets(sources, split=split, random_seed=random_seed, streaming=streaming, shuffle=shuffle) - sources = [ { **src, "weight": src.get("weight", 1.0), "name": src["path"] + (f":{src.get('name', None)}" if src.get("name") else "") } for src in sources ] # ensure all sources have weight key - len_ds = sum(1 for _ in ds.values()) - len_src = sum(1 for _ in sources) - weights_sum = 
sum(src.get("weight", 1.0) for src in sources) - if not len_src == len_ds: - for src in sources: - src["weight"] = src.get("weight", 1.0) / weights_sum - print("Len sources vs loaded datasets:", len_src, len_ds) - print("Sum of dataset weights after adjustment:", sum(src.get("weight", 1.0) for src in sources)) - print(f"Dataset weights scaled by factor of 1 / {weights_sum:.2f} to match number of loaded datasets.") - r = random.Random(random_seed) - if max_chars == -1: - max_chars = sum(len(text) for subset in ds.values() for text in subset["text"]) - print(f"Calculated max_chars from datasets: {max_chars}") + sources = [ + { + **src, + "weight": src.get("weight", 1.0), + "name": src["path"] + ( + f":{src.get('name', None)}" if src.get("name") else "" + ) + } + for src in sources + ] # ensure all sources have weight key + if temperature_alpha is not None: + sources = apply_temperature_sampling(sources, alpha=temperature_alpha) + weights_sum = sum(src["weight"] for src in sources) + sources = _normalize_sources_weights(sources, weights_sum) + + def _make_source_weights(sources): + return { src["name"]: src["weight"] for src in sources } + + source_weights = _make_source_weights(sources) + + if max_bytes == -1: + max_bytes = sum(len(text.encode("utf-8")) for subset in ds.values() for text in subset["text"]) + print(f"Calculated max_bytes from datasets: {max_bytes}") total_chars = 0 + total_bytes = 0 total_docs = 0 shard_index = 0 - shard_chars = 0 + shard_bytes = 0 stat_by_source = { src["name"]: {"chars": 0, "docs": 0, "bytes": 0} for src in sources } + + def cycling_iterator(dataset): + while True: + yield from dataset def open_new_shard(idx): suffix = ".txt.zst" if compressed else ".txt" shard_path = (corpus_dir / f"shard_{idx:05d}").with_suffix(suffix) - # shard_path.mkdir(parents=True, exist_ok=True) f = open(shard_path, "w", encoding="utf-8", errors="ignore") return f - # cctx = zstd.ZstdCompressor(level=3) - # return cctx.stream_writer(f) - iters = { name: 
iter(subset) for name, subset in ds.items() } + r = random.Random(random_seed) + iters = { name: cycling_iterator(subset) for name, subset in ds.items() } writer = open_new_shard(shard_index) - with tqdm(total=max_chars, disable=not show_progress) as pbar: - while total_chars < max_chars: - p = r.random() - try: - for src in sources: - weight = src.get("weight", 1.0) - if p < weight: - sample = next(iters[src["name"]]) - break - else: - p -= weight - except StopIteration: - break + sampler = weighted_sample_generator(sources, iters, r) + + with tqdm(total=max_bytes, disable=not show_progress) as pbar: + while total_bytes < max_bytes: + sample, src_name = next(sampler) text = sample.get("text") or sample.get("content") or "" if not text.strip(): continue - total_docs += 1 - if src.get("name") == "codeparrot/codeparrot-clean": + if src_name == "codeparrot/codeparrot-clean": text = clean_codeparrot_example(text) - text = text[:chars_per_doc] # arbitrary truncation + text = safe_byte_truncate(text, bytes_per_doc) # arbitrary truncation + + if per_dataset_normalizer: - text = per_dataset_normalizer(text, dataset_name=src.get("name", src["name"])) # should be function(text, dataset_name) -> text + text = per_dataset_normalizer(text, dataset_name=src_name) # should be function(text, dataset_name) -> text if not text.strip(): continue - - encoded = text.encode("utf-8") - - writer.write(encoded.decode("utf-8")) + encoded = text.encode("utf-8") + total_chars += len(text) - shard_chars += len(text) + writer.write(text + "\n") + + total_bytes += len(encoded) + shard_bytes += len(encoded) total_docs += 1 - stat_by_source[src["name"]]["chars"] += len(text) - stat_by_source[src["name"]]["docs"] += 1 - stat_by_source[src["name"]]["bytes"] += len(encoded) + stat_by_source[src_name]["chars"] += len(text) + stat_by_source[src_name]["docs"] += 1 + stat_by_source[src_name]["bytes"] += len(encoded) pbar.update(len(encoded)) - if shard_chars >= shard_size_chars: + if shard_bytes >= 
shard_size_bytes: writer.close() shard_index += 1 - shard_chars = 0 + shard_bytes = 0 writer = open_new_shard(shard_index) writer.close() - return total_chars, total_docs, stat_by_source + return total_bytes, total_chars, total_docs, stat_by_source diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index e209dd9..822aac8 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -159,8 +159,6 @@ def __init__( self.special_tokens = special_tokens self.config = config self.bos_token_id = self.encode_special(config.special_tokens.bos) - if getattr(self, 'token_bytes', None) is None: - self.token_bytes = self.get_token_bytes() @classmethod def from_pretrained(cls, name: str, source: Optional[str] = None, special_tokens: Optional[SpecialTokens] = None): @@ -268,12 +266,12 @@ def get_bos_token_id(self): def train_from_iterator( cls, text_iterator: Iterable[str], - config: TokenizerTrainerConfig + config: TokenizerConfig, + tp: Optional[TokenizerTrainerConfig] = None ): special_tokens = config.special_tokens.list() vocab_size_no_special = config.vocab_size - len(special_tokens) # TODO: make the other tokenizers for comparison; lines +1 and +2 below are temporary - tp = getattr(config, "training_params", None) if tp is None: # Legacy fallback tp_trainer = config.trainer @@ -318,7 +316,7 @@ def train_from_iterator( special_tokens=special_tokens, config=config ) - to_save_flag = tp.to_save if tp is not None else getattr(config, "to_save", True) + to_save_flag = tp.to_save if tp is not None else getattr(config, "to_save", False) if to_save_flag: tokenizer.save_to_directory() return tokenizer @@ -397,12 +395,12 @@ def update_token_bytes(self): token_bytes_list.extend([0] * len(self.special_tokens)) new_token_bytes = torch.tensor(token_bytes_list, dtype=torch.int32, device="cpu") - old_vocab_size = getattr(self, "token_bytes", torch.tensor([])).numel() if hasattr(self, "token_bytes") else 0 - self.token_bytes = 
new_token_bytes + old_vocab_size = getattr(self, "_token_bytes", torch.tensor([])).numel() if hasattr(self, "_token_bytes") else 0 + self._token_bytes = new_token_bytes log0(f"Updated token bytes after truncation from {old_vocab_size:,} to {self.vocab_size:,}", logger=logger) # Save token_bytes to disk token_bytes_path = Path(self.config.dirname) / "token_bytes.pt" - torch.save(self.token_bytes, token_bytes_path) + torch.save(self._token_bytes, token_bytes_path) # Persist tokenizer config/metadata try: self.config.save_to_directory() @@ -428,9 +426,7 @@ def save_to_directory(self, directory: Optional[Union[str, Path]] = None): # Save token bytes tensor token_bytes_path = directory / "token_bytes.pt" - if getattr(self, "token_bytes", None) is None: - self.token_bytes = self.get_token_bytes() - torch.save(self.token_bytes, token_bytes_path) + torch.save(self._token_bytes, token_bytes_path) # Write a lightweight JSON descriptor alongside the pickle config for readability config_json = { diff --git a/src/gpt_lab/tokenizer/truncation.py b/src/gpt_lab/tokenizer/truncation.py index 77f4b03..ecdd99e 100644 --- a/src/gpt_lab/tokenizer/truncation.py +++ b/src/gpt_lab/tokenizer/truncation.py @@ -12,6 +12,8 @@ from typing import Optional from pathlib import Path +from gpt_lab.utils.special_tokens import SpecialTokens + # Lightweight module: avoid importing heavy project modules at import time. # Logging is optional; use print() for informational messages here. diff --git a/src/gpt_lab/utils/schemas.py b/src/gpt_lab/utils/schemas.py index 0eda4fb..3ef6c6e 100644 --- a/src/gpt_lab/utils/schemas.py +++ b/src/gpt_lab/utils/schemas.py @@ -160,100 +160,57 @@ def save_to_directory(self, directory: Optional[Union[str, Path]] = None): pickle.dump(self, f) -class TokenizerTrainingParams(BaseModel): - """Encapsulate training-related parameters for tokenizer training. 
- - Phase 6: move training-specific params into a dedicated model to cleanly - separate tokenizer metadata from training-run configuration. - """ - max_chars: int = -1 - chars_per_doc: int = -1 - merges_per_pass: int = 512 - num_proc: int = -1 - trainer: Literal["tiktoken", "huggingface", "bpe", "fbpe", "rbpe", "dummy"] = "huggingface" - show_progress: bool = True - to_save: bool = True - - def model_post_init(self, context: Any) -> None: - # leave defaults; some values will be adjusted by TokenizerTrainerConfig - pass - - -class TokenizerTrainerConfig(TokenizerConfig): +class TokenizerTrainerConfig(BaseModel): model_config = ConfigDict( json_encoders={Path: str}, ) # Backwards-compatible placement of training params. New code should use # `training_params` to access training-related options. - training_params: TokenizerTrainingParams = Field(default_factory=TokenizerTrainingParams) # Keep legacy fields for compatibility; they'll be synced into training_params - max_chars: int = -1 - chars_per_doc: int = -1 + max_bytes: int = -1 + bytes_per_doc: int = -1 merges_per_pass: int = 512 # Only used for fbpe num_proc: int = -1 trainer: Literal["tiktoken", "huggingface", "bpe", "fbpe", "rbpe", "dummy"] = "huggingface" show_progress: bool = True to_save: bool = True - def model_post_init(self, context: Any) -> None: - super().model_post_init(context) - # Sync legacy fields into the new `training_params` container - self.training_params.max_chars = self.max_chars - self.training_params.chars_per_doc = self.chars_per_doc - self.training_params.merges_per_pass = self.merges_per_pass - self.training_params.num_proc = self.num_proc - self.training_params.trainer = self.trainer - self.training_params.show_progress = self.show_progress - self.training_params.to_save = self.to_save - - if self.training_params.trainer == "tiktoken" and self.pat_str == "": - log0("Using tiktoken trainer with an empty pat_str may lead to suboptimal tokenization. 
" - "Consider using a regex pattern for better tokenization performance.", level="warning", logger=logger) - - if self.training_params.max_chars == -1: - self.training_params.max_chars = int(self.vocab_size * 1000 * 2.5) - if self.training_params.chars_per_doc == -1: - self.training_params.chars_per_doc = self.training_params.max_chars // 1000 - if self.training_params.num_proc <= 0: - self.training_params.num_proc = min(32, (os.cpu_count() or 1) - 1) - def save_to_directory(self, directory: Optional[Union[str, Path]] = None): - if directory is not None: - if isinstance(directory, str): - directory = Path(directory) - else: - directory = self.dirname - config_path = directory / "config.pkl" - if not config_path.parent.exists(): - config_path.parent.mkdir(parents=True, exist_ok=True) - with open(str(config_path), "wb") as f: - pickle.dump(self, f) - - # TODO: consider saving with an other tool - json_path = TOKENIZERS_FOLDER / "tokenizers.json" - # df = df[df["name"] != self.name] # Remove existing entry if it exists - - new_row = { - "datetime": time.time(), - "name": self.name, - "vocab_size": self.vocab_size, - "special_tokens": len(self.special_tokens.list()), - "source": self.source, - "trainer": self.trainer, - "directory": str(directory), - "corpus_files": self.dircorpus if isinstance(self.dircorpus, str) else str(self.dircorpus), - "chars_per_doc": self.chars_per_doc, - "corpus_nb_chars": self.max_chars, - } - if json_path.exists(): - with open(json_path, "r") as f: - data = json.load(f) - else: - data = [] - data.append(new_row) - with open(json_path, "w") as f: - json.dump(data, f, indent=2) - + pass + # if directory is not None: + # if isinstance(directory, str): + # directory = Path(directory) + # else: + # directory = self.dirname + # config_path = directory / "config.pkl" + # if not config_path.parent.exists(): + # config_path.parent.mkdir(parents=True, exist_ok=True) + # with open(str(config_path), "wb") as f: + # pickle.dump(self, f) + + # # TODO: 
consider saving with an other tool + # json_path = TOKENIZERS_FOLDER / "tokenizers.json" + + # new_row = { + # "datetime": time.time(), + # "name": self.name, + # "vocab_size": self.vocab_size, + # "special_tokens": len(self.special_tokens.list()), + # "source": self.source, + # "trainer": self.trainer, + # "directory": str(directory), + # "corpus_files": self.dircorpus if isinstance(self.dircorpus, str) else str(self.dircorpus), + # "chars_per_doc": self.bytes_per_doc, + # "corpus_nb_chars": self.max_bytes, + # } + # if json_path.exists(): + # with open(json_path, "r") as f: + # data = json.load(f) + # else: + # data = [] + # data.append(new_row) + # with open(json_path, "w") as f: + # json.dump(data, f, indent=2) class DatasetConfig(BaseModel): name: str @@ -266,14 +223,6 @@ class DatasetConfig(BaseModel): sorted: bool = True max_shards: Optional[int] = None streaming: bool = False - # source: str - # split: Literal["train", "validation", "test"] - # seed: Optional[int] - # shard_size: Optional[int] - # num_shards: Optional[int] - # data_dir: Optional[Union[str,Path]] = DATA_DIR - # num_proc: Optional[int] - # stream: bool = True class DataLoaderConfig(BaseModel): batch_size: int = 1 diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 3cb1005..ed9afe3 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -446,9 +446,17 @@ def __init__(self, *args, **kwargs): # Expect mergeable ranks to be a dict mapping bytes to ints and include single-byte entries assert isinstance(out, dict) - # Check that merged pair "a","b" became bytes key - assert any(isinstance(k, (bytes, bytearray)) for k in out.keys()) - assert out.get(bytes([0])) == 0 + # Check that merged pairs produced exact byte keys and ranks + # Dummy merges: ["a","b"] -> b"ab" with rank 256, ["Ġx","y"] -> b" xy" with rank 257 + assert out.get(b"ab") == 256 + assert out.get(" xy".encode("utf-8")) == 257 + + # All single-byte tokens should be present and map to their own integer 
values + for i in range(256): + assert out.get(bytes([i])) == i + + # Total entries should equal 256 single-bytes + 2 merges + assert len(out) == 256 + 2 @pytest.mark.fast def test_compute_optimal_vocab_size_with_explicit_tokenizer_model(monkeypatch): From 2a1f57388843b6038df5fdd7af557a3f0f07457d Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Tue, 19 May 2026 15:15:34 +0200 Subject: [PATCH 10/18] tokenizer training config: integrates trainer parameters to tokenizer config --- README.md | 11 +- scripts/scaling_tokenizer.py | 23 ++-- src/gpt_lab/tokenizer/auto.py | 3 +- src/gpt_lab/tokenizer/corpus.py | 3 +- src/gpt_lab/tokenizer/hf.py | 8 +- src/gpt_lab/tokenizer/tokenizer.py | 36 +++--- src/gpt_lab/utils/schemas.py | 83 ++++--------- tests/test_tokenizer.py | 180 ++++++----------------------- 8 files changed, 105 insertions(+), 242 deletions(-) diff --git a/README.md b/README.md index 527c75d..8c25a1c 100644 --- a/README.md +++ b/README.md @@ -351,14 +351,19 @@ The tokenization implementation are located in [`gpt_lab.tokenizer`](./src/gpt_l ```python from gpt_lab.tokenizer import Tokenizer from gpt_lab.tokenizer.corpus import TokenizerCorpus -from gpt_lab.utils.schemas import TokenizerTrainerConfig +from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig # uses default corpus settings (mixture of HuggingFaceFW/fineweb-edu, HuggingFaceFW/fineweb-2, HuggingFaceTB/finemath and codeparrot/codeparrot-clean) corpus = TokenizerCorpus.from_sources(random_seed=42) -cfg = TokenizerTrainerConfig( +trainer_cfg = TokenizerTrainerConfig( + source="huggingface", # training backend (e.g., "huggingface", "tiktoken", "bpe", "fbpe", "rbpe", "dummy") + to_save=True, # pattern for pre-tokenization (e.g., "gpt2", "cl100k-base", etc., or regex pattern for custom pre-tokenization) +) +cfg = TokenizerConfig( name="my_tokenizer", vocab_size=32_000, - pat_str="gpt2", # pattern for pre-tokenization (e.g., "gpt2", "cl100k-base", etc., or regex pattern for custom 
pre-tokenization) + pat_str="gpt2", + trainer=trainer_cfg,# whether to save the trained tokenizer to disk ) tokenizer = Tokenizer.train_from_iterator(cfg, iterator=corpus.iterator()) ``` diff --git a/scripts/scaling_tokenizer.py b/scripts/scaling_tokenizer.py index 682b704..0041b78 100644 --- a/scripts/scaling_tokenizer.py +++ b/scripts/scaling_tokenizer.py @@ -1,5 +1,5 @@ from gpt_lab.tokenizer.tokenizer import Tokenizer -from gpt_lab.utils.schemas import TokenizerTrainerConfig +from gpt_lab.utils.schemas import TokenizerTrainerConfig, TokenizerConfig from gpt_lab.tokenizer.corpus import TokenizerCorpus from gpt_lab.utils.default import PAT_STR_GPT2, PAT_STR_GPT4, PAT_STR_punct, PAT_STR_cl100k_base, PAT_STR_o200k_base, TOKENIZERS_FOLDER, DATA_DIR @@ -182,21 +182,26 @@ def run_tokenizer_experiment(task): corpus_dir=corpus_path, sources=None, max_bytes=corpus_bytemax, - bytes_per_doc=corpus_bytemax // 10_000, + bytes_per_doc=corpus_bytemax // 20_000, random_seed=seed, ) - config = TokenizerTrainerConfig( + trainer_config = TokenizerTrainerConfig( max_bytes=max_bytes, - bytes_per_doc=max_bytes // 10_000, - vocab_size=vocab_size, - name=name, + bytes_per_doc=max_bytes // 20_000, num_proc=num_procs, - trainer="huggingface", - dircorpus=corpus_path, - pat_str=p_str, + source="huggingface", + dircorpus=corpus_path, # TODO: add CorpusConfig instead show_progress=False, to_save=False, ) + config = TokenizerConfig( + name=name, + vocab_size=vocab_size, + pat_str=p_str, + trainer=trainer_config, + source="huggingface", # this is quite dummy + # special_tokens=SpecialTokens(), # using default special tokens, adjust as needed + ) t0 = time.time() tokenizer = Tokenizer.train_from_iterator( text_iterator=corpus.iterator(max_bytes=max_bytes), diff --git a/src/gpt_lab/tokenizer/auto.py b/src/gpt_lab/tokenizer/auto.py index 4ecb7f9..578deee 100644 --- a/src/gpt_lab/tokenizer/auto.py +++ b/src/gpt_lab/tokenizer/auto.py @@ -107,7 +107,8 @@ def build_or_load_tokenizer(tname: 
Optional[str], vocab_size: int, train_tokeniz from gpt_lab.tokenizer.corpus import TokenizerCorpus _tname = base_name - trainer_cfg = TokenizerTrainerConfig(name=_tname, dirname=dirname or base_name, vocab_size=vocab_size, pat_str=pat_str, special_tokens=special_tokens) + trainer_cfg = TokenizerTrainerConfig() # cfg should be adapted + cfg = TokenizerConfig(name=_tname, source="huggingface", vocab_size=vocab_size, pat_str=pat_str, special_tokens=special_tokens, trainer=trainer_cfg) corpus = TokenizerCorpus.from_sources(corpus_dir=data_dir, max_chars=vocab_size * 4 * 100, random_seed=random_seed) tokenizer = Tokenizer.train_from_iterator(text_iterator=corpus.iterator(), config=trainer_cfg) return tokenizer diff --git a/src/gpt_lab/tokenizer/corpus.py b/src/gpt_lab/tokenizer/corpus.py index d1cf66f..5ac6e4d 100644 --- a/src/gpt_lab/tokenizer/corpus.py +++ b/src/gpt_lab/tokenizer/corpus.py @@ -1,3 +1,4 @@ +from __future__ import annotations from pathlib import Path import random, pickle from gpt_lab.utils.default import RANDOM_SEED, DATA_DIR @@ -255,7 +256,7 @@ def from_sources( compressed: bool = False, shard_size_bytes: Optional[int] = None, loader_fn: Optional[Callable] = None, # if provided, should be function that takes dataset config and returns iterator of text samples; overrides default loading from datasets library - ): + ) -> TokenizerCorpus: meta = None if loader_fn is not None: class CustomLoaderCorpus(TokenizerCorpus): diff --git a/src/gpt_lab/tokenizer/hf.py b/src/gpt_lab/tokenizer/hf.py index 2b46a79..137f920 100644 --- a/src/gpt_lab/tokenizer/hf.py +++ b/src/gpt_lab/tokenizer/hf.py @@ -85,11 +85,14 @@ def save(self, tokenizer_dir: str): self.main.save(tokenizer_path) -def train_huggingface_from_iterator(text_iterator: Iterable[str], config: TokenizerTrainerConfig) -> Dict[bytes, int]: +def train_huggingface_from_iterator(text_iterator: Iterable[str], config: TokenizerConfig) -> Dict[bytes, int]: """Train a HuggingFace BPE tokenizer and return 
mergeable_ranks mapping. Returns a dict mapping byte-strings to ranks (integers). """ + tr_config = getattr(config, "trainer", None) + if tr_config is None: + log_error("TokenizerConfig must have a 'trainer' attribute with training parameters for HuggingFace tokenizer training.", logger=logger, error_type=ValueError) if HFTokenizer is None: log_error("tokenizers library is required for HuggingFace trainer", logger=logger, error_type=ImportError) @@ -111,7 +114,6 @@ def train_huggingface_from_iterator(text_iterator: Iterable[str], config: Tokeni initial_alphabet = pre_tokenizers.ByteLevel.alphabet() # Prefer training-specific params container when available - tp = getattr(config, "training_params", None) vocab_size_no_special = config.vocab_size - len(config.special_tokens.list()) trainer = BpeTrainer( vocab_size=vocab_size_no_special, @@ -120,7 +122,7 @@ def train_huggingface_from_iterator(text_iterator: Iterable[str], config: Tokeni initial_alphabet=initial_alphabet, special_tokens=[], ) - trainer.show_progress = tp.show_progress if tp is not None else config.show_progress + trainer.show_progress = tr_config.show_progress tknzr.train_from_iterator(iterator=text_iterator, trainer=trainer) merges = json.loads(tknzr.to_str())["model"]["merges"] diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index 822aac8..be16a14 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -16,7 +16,7 @@ ) from pathlib import Path -from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig +from gpt_lab.utils.schemas import TokenizerConfig from gpt_lab.utils.default import TOKENIZERS_FOLDER from gpt_lab.utils.special_tokens import SpecialTokens from gpt_lab.utils.logging import log0, log_error @@ -267,56 +267,52 @@ def train_from_iterator( cls, text_iterator: Iterable[str], config: TokenizerConfig, - tp: Optional[TokenizerTrainerConfig] = None ): special_tokens = config.special_tokens.list() 
vocab_size_no_special = config.vocab_size - len(special_tokens) # TODO: make the other tokenizers for comparison; lines +1 and +2 below are temporary - if tp is None: - # Legacy fallback - tp_trainer = config.trainer - else: - tp_trainer = tp.trainer - - if tp_trainer != "huggingface": - msg = f"Training tokenizer with trainer {tp_trainer!r} is not implemented yet. Please use 'huggingface' trainer for now." + tp_trainer = config.trainer + if tp_trainer is None: + log_error("TokenizerConfig.trainer is not set. Please set the trainer explicitly in TokenizerConfig.", logger=logger, error_type=UserWarning) + if tp_trainer.source != "huggingface": + msg = f"Training tokenizer with trainer {tp_trainer.source!r} is not implemented yet. Please use 'huggingface' trainer for now." log_error(msg, error_type=NotImplementedError, logger=logger) # TODO: make pretokenizer here -> options: 1. gpt2, 2. custom - if tp_trainer == "tiktoken": + if tp_trainer.source == "tiktoken": from tiktoken._educational import bpe_train log0("Training tokenizer with tiktoken is a TODO for future improvement.", level="warning", logger=logger) # TODO: WIP, not tested yet mergeable_ranks = bpe_train(data=text_iterator, vocab_size=vocab_size_no_special, pat_str=config.pat_str) - elif tp_trainer == "huggingface": + elif tp_trainer.source == "huggingface": # Delegate HuggingFace training logic to tokenizer.hf module mergeable_ranks = train_huggingface_from_iterator(text_iterator, config) # TODO: add other trainer options (bpe, rust bpe, fast bpe...) # The following options are placeholders for future impl. - elif tp_trainer in ["bpe", "fbpe", "rbpe"]: - raise NotImplementedError(f"Tokenizer training mode {tp_trainer!r} is not yet implemented. Please use 'huggingface' mode.") - elif tp_trainer == "bpe": + elif tp_trainer.source in ["bpe", "fbpe", "rbpe"]: + raise NotImplementedError(f"Tokenizer training mode {tp_trainer.source!r} is not yet implemented. 
Please use 'huggingface' mode.") + elif tp_trainer.source == "bpe": # naive python implementation of byte-level BPE, not optimized for large corpora, but serves as a reference from gpt_lab.tokenizer.bpe import bpe _, mergeable_ranks = bpe() - elif tp_trainer == "fbpe": + elif tp_trainer.source == "fbpe": from gpt_lab.tokenizer.bpe import bpe_fast trainer = ... - elif tp_trainer == "rbpe": + elif tp_trainer.source == "rbpe": from rbpe import bpe ... - elif tp_trainer == "dummy": + elif tp_trainer.source == "dummy": log0("Using DummyTokenizer for training, this is not a real tokenizer and should only be used for testing purposes.", level="warning", logger=logger) return cls(DummyTokenizer(config), config) else: - msg = f"Tokenizer trainer {tp_trainer!r} is not supported." + msg = f"Tokenizer trainer {tp_trainer.source!r} is not supported." log_error(msg, error_type=NotImplementedError, logger=logger) tokenizer = cls( mergeable_ranks=mergeable_ranks, special_tokens=special_tokens, config=config ) - to_save_flag = tp.to_save if tp is not None else getattr(config, "to_save", False) + to_save_flag = tp_trainer.to_save if tp_trainer is not None else getattr(config, "to_save", False) if to_save_flag: tokenizer.save_to_directory() return tokenizer diff --git a/src/gpt_lab/utils/schemas.py b/src/gpt_lab/utils/schemas.py index 3ef6c6e..c204199 100644 --- a/src/gpt_lab/utils/schemas.py +++ b/src/gpt_lab/utils/schemas.py @@ -84,18 +84,38 @@ def local_heads_kv(self) -> int: raise ValueError("n_heads_kv is not set for TensorParallelConfig") return self.n_heads_kv // self.tp_size +class TokenizerTrainerConfig(BaseModel): + model_config = ConfigDict( + json_encoders={Path: str}, + ) + # Backwards-compatible placement of training params. New code should use + # `training_params` to access training-related options. 
+ # Keep legacy fields for compatibility; they'll be synced into training_params + max_bytes: int = -1 + bytes_per_doc: int = -1 + merges_per_pass: int = 512 # Only used for fbpe + num_proc: int = -1 + source: Literal["tiktoken", "huggingface", "bpe", "fbpe", "rbpe", "dummy"] = "huggingface" + show_progress: bool = True + to_save: bool = True + + dircorpus: Optional[Union[str, Path]] = None + + # corpus: CorpusConfig = Field(default_factory=CorpusConfig) # TODO: for reproducity + class TokenizerConfig(BaseModel): model_config = ConfigDict( json_encoders={Path: str}, ) name: str = "ic1_tok" dirname: Union[str, Path] = TOKENIZERS_FOLDER - dircorpus: Optional[Union[str, Path]] = None vocab_size: int = VOCAB_SIZE pat_str: str = "gpt4" special_tokens: Optional[SpecialTokens] = Field(default_factory=SpecialTokens) source: TokenizerSources = "tiktoken" + trainer: Optional[TokenizerTrainerConfig] = None + def model_post_init(self, context: Any) -> None: if self.pat_str in PAT_STR.keys(): self.pat_str = PAT_STR.get(self.pat_str) # Use predefined pattern if pat_str is a key in PAT_STR @@ -110,11 +130,7 @@ def model_post_init(self, context: Any) -> None: self.dirname = Path(self.dirname) cleaned_name = self.name.split("/")[-1] # Remove leading/trailing slashes if not self.dirname.name == cleaned_name: # add model name to path if not already included - self.dirname = self.dirname / cleaned_name - if self.dircorpus is not None and isinstance(self.dircorpus, str): - self.dircorpus = Path(self.dircorpus) - if not self.dirname.exists(): - self.dirname.mkdir(parents=True, exist_ok=False) + self.dirname = self.dirname / cleaned_name def get_mergeable_ranks(self) -> dict: if not self.dirname.exists(): @@ -154,64 +170,11 @@ def save_to_directory(self, directory: Optional[Union[str, Path]] = None): if not directory.name == cleaned_name: # add model name to path if not already included directory = directory / cleaned_name config_path = directory / "config.pkl" - 
config_path.mkdir(parents=True, exist_ok=False) + config_path.mkdir(parents=True, exist_ok=True) with open(str(config_path), "wb") as f: pickle.dump(self, f) - -class TokenizerTrainerConfig(BaseModel): - model_config = ConfigDict( - json_encoders={Path: str}, - ) - # Backwards-compatible placement of training params. New code should use - # `training_params` to access training-related options. - # Keep legacy fields for compatibility; they'll be synced into training_params - max_bytes: int = -1 - bytes_per_doc: int = -1 - merges_per_pass: int = 512 # Only used for fbpe - num_proc: int = -1 - trainer: Literal["tiktoken", "huggingface", "bpe", "fbpe", "rbpe", "dummy"] = "huggingface" - show_progress: bool = True - to_save: bool = True - - def save_to_directory(self, directory: Optional[Union[str, Path]] = None): - pass - # if directory is not None: - # if isinstance(directory, str): - # directory = Path(directory) - # else: - # directory = self.dirname - # config_path = directory / "config.pkl" - # if not config_path.parent.exists(): - # config_path.parent.mkdir(parents=True, exist_ok=True) - # with open(str(config_path), "wb") as f: - # pickle.dump(self, f) - - # # TODO: consider saving with an other tool - # json_path = TOKENIZERS_FOLDER / "tokenizers.json" - - # new_row = { - # "datetime": time.time(), - # "name": self.name, - # "vocab_size": self.vocab_size, - # "special_tokens": len(self.special_tokens.list()), - # "source": self.source, - # "trainer": self.trainer, - # "directory": str(directory), - # "corpus_files": self.dircorpus if isinstance(self.dircorpus, str) else str(self.dircorpus), - # "chars_per_doc": self.bytes_per_doc, - # "corpus_nb_chars": self.max_bytes, - # } - # if json_path.exists(): - # with open(json_path, "r") as f: - # data = json.load(f) - # else: - # data = [] - # data.append(new_row) - # with open(json_path, "w") as f: - # json.dump(data, f, indent=2) - class DatasetConfig(BaseModel): name: str hfkwargs: dict = 
Field(default_factory=dict) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index ed9afe3..8fc9219 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -13,6 +13,7 @@ from gpt_lab.tokenizer.truncation import parse_truncated_name, truncated_from_pretrained from gpt_lab.utils.special_tokens import SpecialTokens from gpt_lab.tokenizer import Tokenizer, TokenizerConfig +from gpt_lab.utils.schemas import TokenizerTrainerConfig from gpt_lab.tokenizer import hf as tokenizer_hf from gpt_lab.tokenizer.corpus import TokenizerCorpus import gpt_lab.tokenizer.auto as tokenizer_auto @@ -46,7 +47,7 @@ def dummy_large(): @pytest.mark.fast def test_tokenizer(dummy_small, dummy_large): special_tokens = SpecialTokens() - config = TokenizerConfig(name="gpt2", vocab_size=-1, max_context=16, pat_str="gpt2", source="tiktoken", special_tokens=special_tokens) + config = TokenizerConfig(name="gpt2", vocab_size=-1, pat_str="gpt2", source="tiktoken", special_tokens=special_tokens) tokenizer = Tokenizer.from_config(config=config) # Test small dataset @@ -67,7 +68,7 @@ def test_tokenizer(dummy_small, dummy_large): def test_train_tokenizer(dummy_small): special_tokens = SpecialTokens() corpus = list(dummy_small) # Convert generator to list for multiple iterations - config = TokenizerConfig(name="gpt2", vocab_size=-1, max_context=16, pat_str="gpt2", source="tiktoken", special_tokens=special_tokens) + config = TokenizerConfig(name="gpt2", vocab_size=-1, pat_str="gpt2", source="tiktoken", special_tokens=special_tokens) tokenizer = Tokenizer.from_config(config=config) # Simulate training by encoding and decoding the same sample multiple times @@ -278,25 +279,19 @@ def test_build_or_load_tokenizer_notrain_uses_pretrained(monkeypatch): @pytest.mark.fast def test_build_or_load_tokenizer_training_path(monkeypatch): import gpt_lab.tokenizer.auto as tokenizer_auto - created_cfg = {} - - class FakeTrainerCfg: - def __init__(self, **kwargs): - created_cfg.update(kwargs) + + 
sentinel = object() class FakeCorpus: def iterator(self): return iter(["abc", "def"]) - sentinel = object() - - monkeypatch.setattr(tokenizer_auto, "TokenizerTrainerConfig", FakeTrainerCfg) monkeypatch.setattr( - "gpt_lab.tokenizer.auto.TokenizerCorpus.from_sources", + "gpt_lab.tokenizer.corpus.TokenizerCorpus.from_sources", lambda **_kwargs: FakeCorpus(), ) monkeypatch.setattr( - "gpt_lab.tokenizer.auto.Tokenizer.train_from_iterator", + "gpt_lab.tokenizer.tokenizer.Tokenizer.train_from_iterator", lambda text_iterator, config: sentinel, ) @@ -312,10 +307,8 @@ def iterator(self): dirname="/tmp/tokdir", ) + # The function should return the result from train_from_iterator assert out is sentinel - assert created_cfg["name"] == "my_tok" - assert created_cfg["vocab_size"] == 4096 - assert created_cfg["dirname"] == "/tmp/tokdir" @pytest.mark.fast @@ -323,16 +316,19 @@ def test_train_huggingface_from_iterator_requires_tokenizers(monkeypatch): import gpt_lab.tokenizer.hf as tokenizer_hf monkeypatch.setattr(tokenizer_hf, "HFTokenizer", None) - cfg = type( - "DummyCfg", - (), - { - "vocab_size": 300, - "pat_str": "gpt2", - "show_progress": False, - "special_tokens": type("DummyST", (), {"list": lambda self: ["<|bos|>"]})(), - }, - )() + special_tokens = SpecialTokens() + trainer_cfg = TokenizerTrainerConfig( + source="huggingface", + show_progress=False, + ) + cfg = TokenizerConfig( + name="test", + vocab_size=300, + pat_str="gpt2", + special_tokens=special_tokens, + source="huggingface", + trainer=trainer_cfg, + ) with pytest.raises(Exception): tokenizer_hf.train_huggingface_from_iterator(["hello"], cfg) @@ -439,8 +435,20 @@ def __init__(self, *args, **kwargs): monkeypatch.setattr(tokenizer_hf, "decoders", DummyDecoders) monkeypatch.setattr(tokenizer_hf, "BpeTrainer", DummyBpeTrainer) - # Minimal trainer config - cfg = type("DummyCfg", (), {"vocab_size": 300, "pat_str": "\\w+", "show_progress": False, "special_tokens": SpecialTokens()})() + # Create a proper TokenizerConfig 
with trainer + special_tokens = SpecialTokens() + trainer_cfg = TokenizerTrainerConfig( + source="huggingface", + show_progress=False, + ) + cfg = TokenizerConfig( + name="test", + vocab_size=300, + pat_str="\\w+", + special_tokens=special_tokens, + source="huggingface", + trainer=trainer_cfg, + ) out = tokenizer_hf.train_huggingface_from_iterator(["hello world"], cfg) @@ -457,121 +465,3 @@ def __init__(self, *args, **kwargs): # Total entries should equal 256 single-bytes + 2 merges assert len(out) == 256 + 2 - -@pytest.mark.fast -def test_compute_optimal_vocab_size_with_explicit_tokenizer_model(monkeypatch): - class DummyTokenizer: - vocab_size = 777 - - monkeypatch.setattr( - tokenizer_auto.Tokenizer, - "from_pretrained", - lambda name: DummyTokenizer(), - ) - - out = tokenizer_auto.compute_optimal_vocab_size( - depth=4, - aspect_ratio=16, - train_tokenizer=False, - tokenizer_model="gpt2", - special_tokens=SpecialTokens(), - ) - assert out == 777 - - -@pytest.mark.fast -def test_compute_optimal_vocab_size_raises_when_too_small(monkeypatch): - class DummyMetaModel: - n_params = 1 - - import gpt_lab.model.checkpoint as mcheck - monkeypatch.setattr(mcheck, "build_meta_model", lambda _cfg: DummyMetaModel()) - - with pytest.raises(ValueError, match="<256"): - tokenizer_auto.compute_optimal_vocab_size( - depth=2, - aspect_ratio=8, - train_tokenizer=False, - tokenizer_model=None, - special_tokens=SpecialTokens(), - get_closest=lambda _x: ("tiny", 128), - ) - - -@pytest.mark.fast -def test_resolve_tokenizer_explicit_or_auto(monkeypatch): - monkeypatch.setattr( - tokenizer_auto, - "get_closest_tokenizer_size", - lambda _vocab_size: ("cl100k_base", 100000), - ) - - assert tokenizer_auto.resolve_tokenizer("gpt2", 32000, SpecialTokens()) == "gpt2" - assert tokenizer_auto.resolve_tokenizer(None, 32000, SpecialTokens()) == "cl100k_base" - assert tokenizer_auto.resolve_tokenizer("auto", 32000, SpecialTokens()) == "cl100k_base" - - -@pytest.mark.fast -def 
test_build_or_load_tokenizer_notrain_uses_pretrained(monkeypatch): - sentinel = object() - monkeypatch.setattr( - tokenizer_auto.Tokenizer, - "from_pretrained", - lambda _name: sentinel, - ) - - out = tokenizer_auto.build_or_load_tokenizer( - tname="gpt2", - vocab_size=32000, - train_tokenizer=False, - base_name="unused", - pat_str="gpt2", - special_tokens=SpecialTokens(), - data_dir="unused", - random_seed=42, - ) - assert out is sentinel - - -@pytest.mark.fast -def test_build_or_load_tokenizer_training_path(monkeypatch): - created_cfg = {} - - class FakeTrainerCfg: - def __init__(self, **kwargs): - created_cfg.update(kwargs) - - class FakeCorpus: - def iterator(self): - return iter(["abc", "def"]) - - sentinel = object() - - monkeypatch.setattr(tokenizer_auto, "TokenizerTrainerConfig", FakeTrainerCfg) - monkeypatch.setattr( - TokenizerCorpus, - "from_sources", - lambda **_kwargs: FakeCorpus(), - ) - monkeypatch.setattr( - tokenizer_auto.Tokenizer, - "train_from_iterator", - lambda text_iterator, config: sentinel, - ) - - out = tokenizer_auto.build_or_load_tokenizer( - tname=None, - vocab_size=4096, - train_tokenizer=True, - base_name="my_tok", - pat_str="gpt2", - special_tokens=SpecialTokens(), - data_dir="/tmp/corpus", - random_seed=7, - dirname="/tmp/tokdir", - ) - - assert out is sentinel - assert created_cfg["name"] == "my_tok" - assert created_cfg["vocab_size"] == 4096 - assert created_cfg["dirname"] == "/tmp/tokdir" From 05f986efd4f610af7ab57869ddf8c5d7fbc39206 Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Tue, 19 May 2026 17:00:02 +0200 Subject: [PATCH 11/18] tokenizer: some function calls fixes --- scripts/scaling_tokenizer.py | 29 +++++++++++++++++++++-------- src/gpt_lab/tokenizer/auto.py | 4 ++-- src/gpt_lab/tokenizer/base.py | 5 +++-- src/gpt_lab/tokenizer/corpus.py | 17 +++++++++++++---- src/gpt_lab/tokenizer/hf.py | 3 ++- src/gpt_lab/utils/schemas.py | 1 + 6 files changed, 42 insertions(+), 17 deletions(-) diff --git 
a/scripts/scaling_tokenizer.py b/scripts/scaling_tokenizer.py index 0041b78..f652972 100644 --- a/scripts/scaling_tokenizer.py +++ b/scripts/scaling_tokenizer.py @@ -1,7 +1,11 @@ +from gpt_lab.utils.common import get_banner +get_banner(to_print=True) + from gpt_lab.tokenizer.tokenizer import Tokenizer from gpt_lab.utils.schemas import TokenizerTrainerConfig, TokenizerConfig from gpt_lab.tokenizer.corpus import TokenizerCorpus from gpt_lab.utils.default import PAT_STR_GPT2, PAT_STR_GPT4, PAT_STR_punct, PAT_STR_cl100k_base, PAT_STR_o200k_base, TOKENIZERS_FOLDER, DATA_DIR +from gpt_lab.utils.logging import log0 from pathlib import Path import argparse, pickle, zipfile @@ -11,8 +15,15 @@ import regex as re +import logging +logger = logging.getLogger(__name__) + BASELINES = ["gpt2", "cl100k_base", "o200k_base"] +# easy +def print(*args, **kwargs): + log0(" ".join(str(arg) for arg in args), **kwargs, logger=logger) + def load_all_results(path): results = [] with open(path, "rb") as f: @@ -200,6 +211,7 @@ def run_tokenizer_experiment(task): pat_str=p_str, trainer=trainer_config, source="huggingface", # this is quite dummy + save_token_bytes=False, # we will compute token bytes on the fly without saving to disk to avoid IO overhead, adjust as needed based on your use case and whether you want to inspect the token bytes files # special_tokens=SpecialTokens(), # using default special tokens, adjust as needed ) t0 = time.time() @@ -258,13 +270,13 @@ def main(): with open(backup_path, "rb") as f: results = pickle.load(f) if results == [] or results is None: - print(f"Existing results file {backup_path} is empty. It will be overwritten with new results.") + log0(f"Existing results file {backup_path} is empty. It will be overwritten with new results.", logger=logger) break new_name = results_path.stem + f"_{i}" backup_path = backup_path.with_stem(new_name) i += 1 results_path.rename(backup_path) - print(f"Existing results file found. 
Renamed to {backup_path!r} to avoid overwriting. New results will be stored in {results_path!r}.") + log0(f"Existing results file found. Renamed to {backup_path!r} to avoid overwriting. New results will be stored in {results_path!r}.", logger=logger) def store_results(results_batch, path=results_path): @@ -311,18 +323,19 @@ def store_results(results_batch, path=results_path): # Corpus size varying with different vocab_sizes and split patterns - patterns = { "pat_str-gpt2": PAT_STR_GPT2, "pat_str-gpt4": PAT_STR_GPT4, "pat_str-punct": PAT_STR_punct, "pat_str-cl100k_base": PAT_STR_cl100k_base, "pat_str-o200k_base": PAT_STR_o200k_base } + # patterns = { "pat_str-gpt2": PAT_STR_GPT2, "pat_str-gpt4": PAT_STR_GPT4, "pat_str-punct": PAT_STR_punct, "pat_str-cl100k_base": PAT_STR_cl100k_base, "pat_str-o200k_base": PAT_STR_o200k_base } + patterns = { "pat_str-gpt4": PAT_STR_GPT4 } # patterns = { "PAT_STR_o200k_base": PAT_STR_o200k_base } # TODO: optimize by running the biggest vocab size and slice it on top-k merges for smaller vocabs # vocab_sizes = [10_000, 20_000, 30_000, 50_000, 100_000, 200_000, 300_000, 500_000] - vocab_sizes = [30_000, 50_000, 70_000, 100_000, 200_000] + vocab_sizes = [50_000, 70_000, 100_000, 200_000] # vocab_sizes = list(reversed(vocab_sizes)) - _max_char_runs = 8 # adjust the divisor to control how many runs are done before storing results to disk, this is a trade-off between memory usage and frequency of saving intermediate results. With 3 processes, we can afford to do more runs before saving, but if you have more memory constraints, you might want to save more frequently by using a smaller divisor. 
- max_bytes = lambda vocab_size: [int(vocab_size * i * 512) for i in range(1, _max_char_runs+1)] # ~3.5 characters per token on average, adjust as needed based on your corpus + _max_char_runs = 16 # adjust the divisor to control how many runs are done before storing results to disk, this is a trade-off between memory usage and frequency of saving intermediate results. With 3 processes, we can afford to do more runs before saving, but if you have more memory constraints, you might want to save more frequently by using a smaller divisor. + max_bytes = lambda vocab_size: [int(vocab_size * i * 512) for i in range(1, _max_char_runs+1, 2)] # ~3.5 characters per token on average, adjust as needed based on your corpus # Two options: same name for all tokenizers -> overwrite / different names -> many tokenizers on disk, consider cleaning up after training or implementing a caching mechanism to avoid retraining the same tokenizer multiple times. # name = lambda vocab_size, max_char, p_str_name: f"ic1-tok-{int(vocab_size//1000)}k_maxchar-{max_char//1e6:.1f}M_pattern-{p_str_name}" - print(f"Using {num_procs} processes for tokenizer training.") + log0(f"Using {num_procs} processes for tokenizer training.") corpus_path = DATA_DIR / "corpus" / results_path.stem results = [] corpus_bytemax = max(max_bytes(max(vocab_sizes))) @@ -367,7 +380,7 @@ def store_results(results_batch, path=results_path): from concurrent.futures import ProcessPoolExecutor, as_completed mp.set_start_method("spawn", force=True) - max_workers = min(os.cpu_count(), 8) # be conservative + max_workers = min(os.cpu_count(), 4) # be conservative results = [] # tasks_chunks = [tasks[i:i + max_workers] for i in range(0, len(tasks), _max_char_runs)] # for chunk in tqdm(tasks_chunks, desc="Processing task chunks"): diff --git a/src/gpt_lab/tokenizer/auto.py b/src/gpt_lab/tokenizer/auto.py index 578deee..0bad08e 100644 --- a/src/gpt_lab/tokenizer/auto.py +++ b/src/gpt_lab/tokenizer/auto.py @@ -109,6 +109,6 @@ def 
build_or_load_tokenizer(tname: Optional[str], vocab_size: int, train_tokeniz _tname = base_name trainer_cfg = TokenizerTrainerConfig() # cfg should be adapted cfg = TokenizerConfig(name=_tname, source="huggingface", vocab_size=vocab_size, pat_str=pat_str, special_tokens=special_tokens, trainer=trainer_cfg) - corpus = TokenizerCorpus.from_sources(corpus_dir=data_dir, max_chars=vocab_size * 4 * 100, random_seed=random_seed) - tokenizer = Tokenizer.train_from_iterator(text_iterator=corpus.iterator(), config=trainer_cfg) + corpus = TokenizerCorpus.from_sources(corpus_dir=data_dir, max_bytes=vocab_size * 4 * 100, random_seed=random_seed) + tokenizer = Tokenizer.train_from_iterator(text_iterator=corpus.iterator(), config=cfg) return tokenizer diff --git a/src/gpt_lab/tokenizer/base.py b/src/gpt_lab/tokenizer/base.py index 701b07c..f9c9d4c 100644 --- a/src/gpt_lab/tokenizer/base.py +++ b/src/gpt_lab/tokenizer/base.py @@ -71,8 +71,9 @@ def get_token_bytes(self): # Special tokens are always zero-length for token_bytes token_bytes_list.extend([0] * len(self.special_tokens)) token_bytes = torch.tensor(token_bytes_list, dtype=torch.int32, device="cpu") - with open(token_bytes_path, "wb") as f: - torch.save(token_bytes, f) + if self.config.save_token_bytes: + with open(token_bytes_path, "wb") as f: + torch.save(token_bytes, f) log0(f"Saved token_bytes to {token_bytes_path}", logger=logger) self._token_bytes = token_bytes diff --git a/src/gpt_lab/tokenizer/corpus.py b/src/gpt_lab/tokenizer/corpus.py index 5ac6e4d..87c8155 100644 --- a/src/gpt_lab/tokenizer/corpus.py +++ b/src/gpt_lab/tokenizer/corpus.py @@ -172,9 +172,11 @@ def iterator(self, max_bytes: Optional[int] = None) -> Iterable[str]: for shard in self.shard_paths(): with self.open_text_file(shard) as f: for line in f: - yield line.strip() - byte_count += len(line) - if max_bytes and byte_count >= max_bytes: + byte_count += len(line.encode("utf-8")) + + yield line.rstrip("\n") + + if max_bytes is not None and byte_count 
>= max_bytes: return @staticmethod @@ -267,7 +269,14 @@ def iterator(self, max_bytes: Optional[int] = None) -> Iterable[str]: else: try: meta = cls.from_path(corpus_dir) - except (FileNotFoundError, pickle.UnpicklingError, EOFError): + except ( + FileNotFoundError, + pickle.PickleError, + EOFError, + ValueError, + AttributeError, + ModuleNotFoundError, + ): meta = cls.write_from_sources( corpus_dir=corpus_dir, sources=sources, diff --git a/src/gpt_lab/tokenizer/hf.py b/src/gpt_lab/tokenizer/hf.py index 137f920..25c5737 100644 --- a/src/gpt_lab/tokenizer/hf.py +++ b/src/gpt_lab/tokenizer/hf.py @@ -114,7 +114,8 @@ def train_huggingface_from_iterator(text_iterator: Iterable[str], config: Tokeni initial_alphabet = pre_tokenizers.ByteLevel.alphabet() # Prefer training-specific params container when available - vocab_size_no_special = config.vocab_size - len(config.special_tokens.list()) + _special_tokens = list(config.special_tokens) or [] + vocab_size_no_special = config.vocab_size - len(config.special_tokens) trainer = BpeTrainer( vocab_size=vocab_size_no_special, show_progress=True, diff --git a/src/gpt_lab/utils/schemas.py b/src/gpt_lab/utils/schemas.py index c204199..ab5524f 100644 --- a/src/gpt_lab/utils/schemas.py +++ b/src/gpt_lab/utils/schemas.py @@ -114,6 +114,7 @@ class TokenizerConfig(BaseModel): special_tokens: Optional[SpecialTokens] = Field(default_factory=SpecialTokens) source: TokenizerSources = "tiktoken" + save_token_bytes: bool = True trainer: Optional[TokenizerTrainerConfig] = None def model_post_init(self, context: Any) -> None: From 8c587cdf2173ec3b72f75384e98b7abe60dfeb6d Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Wed, 20 May 2026 12:22:20 +0200 Subject: [PATCH 12/18] tokenizer: fix sp token in truncated tokenizer + scaling tok header + move to benchmark folder --- .../dataloaders.py} | 0 .../tokenizer_corpus_size.py} | 64 ++++++++++++++++--- src/gpt_lab/tokenizer/tokenizer.py | 6 +- src/gpt_lab/tokenizer/truncation.py | 13 +--- 4 files 
changed, 61 insertions(+), 22 deletions(-) rename scripts/{benchmark_dataloaders.py => benchmark/dataloaders.py} (100%) rename scripts/{scaling_tokenizer.py => benchmark/tokenizer_corpus_size.py} (83%) diff --git a/scripts/benchmark_dataloaders.py b/scripts/benchmark/dataloaders.py similarity index 100% rename from scripts/benchmark_dataloaders.py rename to scripts/benchmark/dataloaders.py diff --git a/scripts/scaling_tokenizer.py b/scripts/benchmark/tokenizer_corpus_size.py similarity index 83% rename from scripts/scaling_tokenizer.py rename to scripts/benchmark/tokenizer_corpus_size.py index f652972..1e21cdc 100644 --- a/scripts/scaling_tokenizer.py +++ b/scripts/benchmark/tokenizer_corpus_size.py @@ -1,5 +1,48 @@ -from gpt_lab.utils.common import get_banner -get_banner(to_print=True) +""" +# Tokenizer Corpus Size Benchmarking Script + +Full recipe for training and scaling tokenizer with different corpus sizes, vocabulary sizes, patterns, +and evaluating the trained tokenizers on a simple test set to analyze the trade-offs between corpus size, +vocabulary size, training time, and tokenization quality. + +There is similar study on studying the optimal corpus size for training a BPE tokenizer as: +- Reddy et al., "How Much is Enough? The Diminishing Returns of Tokenization Training Data", + +However, this study is focused on training a BPE tokenizer with a specific size. Here, we want to analyze the trade-offs +between corpus size, vocabulary size, and tokenization quality, and also compare with truncated versions of baseline +tokenizers to see how much of the performance can be retained with a smaller vocabulary size. + +This is mainly motivated by the following facts: +- Language models have been scaled up but tokenizer sizes have not been scaled up as much, and it is not clear how much the tokenizer performance can be improved by scaling up the tokenizer training corpus and vocabulary size. 
+- According to [2], Language model performance is sensitive to tokenizer size, and the optimal size is often larger than the commonly used 50k tokens, especially for larger models and more diverse corpora. + +## Usage + +How to run it from root directory of the repo: + +- Make a new scaling run with new corpus sizes: + + +[!NOTE] +Recommended: run with `--optim-config-path=configs/optim.yaml` argument. + +## Aknowledgements: +This code is inspired by and adapted from the following sources: +- The Hugging Face Tokenizers library (https://github.com/huggingface/tokenizers) +- The OpenAI tiktoken library (https://github.com/openai/tiktoken) + +## References: +1. Reddy, Varshini, et al. "How much is enough? the diminishing returns of tokenization training data." arXiv preprint arXiv:2502.20273 (2025). +2. + +Author: Arthur Testard (arthur.testard.pro@gmail.com) +Please cite this work if the code is helpful to you. +""" +if __name__ == "__main__": + from gpt_lab.utils.logging import init_logger + init_logger() + from gpt_lab.utils.common import get_banner + get_banner(to_print=True) from gpt_lab.tokenizer.tokenizer import Tokenizer from gpt_lab.utils.schemas import TokenizerTrainerConfig, TokenizerConfig @@ -249,7 +292,10 @@ def main(): parser = argparse.ArgumentParser(description="Find the optimal corpus size for training a BPE tokenizer with different vocabulary sizes, and evaluate the trained tokenizers on a simple test set to analyze the trade-offs between corpus size, vocabulary size, training time, and tokenization quality.") parser.add_argument("--write-corpus", action="store_true", help="Flag to indicate training mode (write corpus). 
If not set, the script will attempt to load an existing corpus from disk.") parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--results-path", type=str, default=str(TOKENIZERS_FOLDER / 'scaling_tokenizer_results.pkl'), help="Path to store the results of the tokenizer evaluations.") + parser.add_argument("--vocab-sizes", type=str, default="50000,70000,100000,200000", help="Comma-separated list of vocabulary sizes to train tokenizers with.") + parser.add_argument("--pat-strs", type=str, default=None, help="Comma-separated list of pattern string names to use for tokenizer training. If not specified, defaults to using the GPT-2 pattern string.") + parser.add_argument("--corpus-sizes-mb", type=str, default=None, help="Comma-separated list of corpus sizes in megabytes to use for tokenizer training. If not specified, defaults to a range of sizes based on the vocabulary size.") + parser.add_argument("--results-path", type=str, default=str(TOKENIZERS_FOLDER / 'scaling_tokenizer_results.pkl'), help="Path to store the results of the tokenizer evaluations. Default to './.gpt_lab/tokenizers/scaling_tokenizer_results.pkl'. If a file already exists at this path, it will be renamed with a number suffix to avoid overwriting previous results.") parser.add_argument("--compare-truncated-baselines", action="store_true", help="Whether to compare trained tokenizers with truncated versions of baseline tokenizers.") parser.add_argument("--corpus-temperature-alpha", type=float, default=None, help="Optional temperature parameter to control the randomness of the corpus generation. Higher values will result in a more diverse corpus, while lower values will make it more focused on the most common samples. 
This can be useful for testing how the tokenizer performs with different levels of corpus diversity.") args = parser.parse_args() @@ -324,15 +370,15 @@ def store_results(results_batch, path=results_path): # Corpus size varying with different vocab_sizes and split patterns # patterns = { "pat_str-gpt2": PAT_STR_GPT2, "pat_str-gpt4": PAT_STR_GPT4, "pat_str-punct": PAT_STR_punct, "pat_str-cl100k_base": PAT_STR_cl100k_base, "pat_str-o200k_base": PAT_STR_o200k_base } - patterns = { "pat_str-gpt4": PAT_STR_GPT4 } + patterns = { "pat_str-gpt2": PAT_STR_GPT2 } # patterns = { "PAT_STR_o200k_base": PAT_STR_o200k_base } # TODO: optimize by running the biggest vocab size and slice it on top-k merges for smaller vocabs # vocab_sizes = [10_000, 20_000, 30_000, 50_000, 100_000, 200_000, 300_000, 500_000] - vocab_sizes = [50_000, 70_000, 100_000, 200_000] + vocab_sizes = [int(v) for v in args.vocab_sizes.split(",")] if args.vocab_sizes else [50_000, 70_000, 100_000, 200_000] # vocab_sizes = list(reversed(vocab_sizes)) _max_char_runs = 16 # adjust the divisor to control how many runs are done before storing results to disk, this is a trade-off between memory usage and frequency of saving intermediate results. With 3 processes, we can afford to do more runs before saving, but if you have more memory constraints, you might want to save more frequently by using a smaller divisor. - max_bytes = lambda vocab_size: [int(vocab_size * i * 512) for i in range(1, _max_char_runs+1, 2)] # ~3.5 characters per token on average, adjust as needed based on your corpus + max_bytes = lambda vocab_size: [int(vocab_size * i * 1024) for i in range(1, _max_char_runs+1, 2)] # ~3.5 characters per token on average, adjust as needed based on your corpus # Two options: same name for all tokenizers -> overwrite / different names -> many tokenizers on disk, consider cleaning up after training or implementing a caching mechanism to avoid retraining the same tokenizer multiple times. 
# name = lambda vocab_size, max_char, p_str_name: f"ic1-tok-{int(vocab_size//1000)}k_maxchar-{max_char//1e6:.1f}M_pattern-{p_str_name}" log0(f"Using {num_procs} processes for tokenizer training.") @@ -400,9 +446,9 @@ def store_results(results_batch, path=results_path): buffer.append(result) results.append(result) - if len(buffer) >= _max_char_runs: - store_results(buffer) - buffer.clear() + # if len(buffer) >= _max_char_runs: + store_results(buffer) + buffer.clear() if buffer: store_results(buffer) diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index be16a14..afb30b0 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -363,14 +363,16 @@ def from_disk(cls, name: str, cachedir: Optional[Union[str, Path]] = None): ) @classmethod - def truncated_from_pretrained(cls, name: str, new_vocab_size: int, source: str = "tiktoken") -> Tokenizer: + def truncated_from_pretrained(cls, name: str, new_vocab_size: int, source: str = "tiktoken", special_tokens: Optional[SpecialTokens] = None) -> Tokenizer: """Delegate truncation to tokenizer.truncation.truncated_from_pretrained (Phase 2). Signature preserved for backward compatibility. 
""" + if special_tokens is None: + special_tokens = SpecialTokens() from gpt_lab.tokenizer.truncation import truncated_from_pretrained as _trunc - return _trunc(name, new_vocab_size, source=source) + return _trunc(name, new_vocab_size, source=source, special_tokens=special_tokens) @classmethod def get_closest_truncated_from_pretrained(cls, tokenizer: Tokenizer, target_vocab_size: int) -> Tokenizer: diff --git a/src/gpt_lab/tokenizer/truncation.py b/src/gpt_lab/tokenizer/truncation.py index ecdd99e..e46f93e 100644 --- a/src/gpt_lab/tokenizer/truncation.py +++ b/src/gpt_lab/tokenizer/truncation.py @@ -13,6 +13,7 @@ from pathlib import Path from gpt_lab.utils.special_tokens import SpecialTokens +from gpt_lab.tokenizer.serialization import validate_mergeable_ranks # Lightweight module: avoid importing heavy project modules at import time. # Logging is optional; use print() for informational messages here. @@ -94,17 +95,7 @@ def truncated_from_pretrained(base_name: str, new_vocab_size: int, source: str = config=config, ) - # Validate contiguous ranks on the created mergeable_ranks using the - # serialization validation function (loaded lazily to avoid importing - # the package and its heavy dependencies at module import time). 
- import importlib.util, sys - from pathlib import Path as _P - src_root = _P(__file__).resolve().parents[2] - serial_path = src_root / 'gpt_lab' / 'tokenizer' / 'serialization.py' - spec = importlib.util.spec_from_file_location('tokenizer_serial_local', str(serial_path)) - serial_mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(serial_mod) - serial_mod.validate_mergeable_ranks(new_mergeable) + validate_mergeable_ranks(new_mergeable) # Verify token_bytes semantics (compute from raw bytes keys) sorted_items_new = sorted(new_mergeable.items(), key=lambda x: x[1]) From d5d8f21b4d7d1cff68e36f5aa12e24deaa51bd7a Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Thu, 21 May 2026 00:10:51 +0200 Subject: [PATCH 13/18] tokenizer: eval with renyi and efficient entropy --- scripts/benchmark/tokenizer_corpus_size.py | 81 ++++++++++++++++------ 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/scripts/benchmark/tokenizer_corpus_size.py b/scripts/benchmark/tokenizer_corpus_size.py index 1e21cdc..0a3dd05 100644 --- a/scripts/benchmark/tokenizer_corpus_size.py +++ b/scripts/benchmark/tokenizer_corpus_size.py @@ -1,14 +1,21 @@ """ -# Tokenizer Corpus Size Benchmarking Script +# How Does ByteLevel BPE Tokenization Scale? +## Summary Full recipe for training and scaling tokenizer with different corpus sizes, vocabulary sizes, patterns, -and evaluating the trained tokenizers on a simple test set to analyze the trade-offs between corpus size, -vocabulary size, training time, and tokenization quality. +and evaluating the trained tokenizers on a simple test set to analyze the trade-offs between: +- training corpus size, +- vocabulary size, +- split pattern, +- tokenization quality (compression ratio, efficiency, etc.) +- and cross-language generalization (if we have multilingual evaluation sets). There is similar study on studying the optimal corpus size for training a BPE tokenizer as: -- Reddy et al., "How Much is Enough? 
The Diminishing Returns of Tokenization Training Data", +- [1] in which they find that the returns diminish after 150GB of training data, for BPE tokenizers with 40,960, 64,000, 128,000, and 256,000 vocabulary sizes. -However, this study is focused on training a BPE tokenizer with a specific size. Here, we want to analyze the trade-offs +However, this study is focused on training a BPE tokenizer with a specific size. They conclude that over 150GB a tokenizer with + +Here, we want to analyze the trade-offs between corpus size, vocabulary size, and tokenization quality, and also compare with truncated versions of baseline tokenizers to see how much of the performance can be retained with a smaller vocabulary size. @@ -27,13 +34,14 @@ Recommended: run with `--optim-config-path=configs/optim.yaml` argument. ## Aknowledgements: -This code is inspired by and adapted from the following sources: +This code is inspired by and has some code adapted from the following sources: - The Hugging Face Tokenizers library (https://github.com/huggingface/tokenizers) - The OpenAI tiktoken library (https://github.com/openai/tiktoken) +- nanochat tokenizer code (https://github.com/karpathy/nanochat) for the idea of using HF-training backend + tiktoken-inference backend for efficient training and evaluation of tokenizers. ## References: 1. Reddy, Varshini, et al. "How much is enough? the diminishing returns of tokenization training data." arXiv preprint arXiv:2502.20273 (2025). -2. +2. Zouhar, Vilém, et al. "Tokenization and the noiseless channel." Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2023. Author: Arthur Testard (arthur.testard.pro@gmail.com) Please cite this work if the code is helpful to you. 
@@ -47,9 +55,10 @@ from gpt_lab.tokenizer.tokenizer import Tokenizer from gpt_lab.utils.schemas import TokenizerTrainerConfig, TokenizerConfig from gpt_lab.tokenizer.corpus import TokenizerCorpus -from gpt_lab.utils.default import PAT_STR_GPT2, PAT_STR_GPT4, PAT_STR_punct, PAT_STR_cl100k_base, PAT_STR_o200k_base, TOKENIZERS_FOLDER, DATA_DIR +from gpt_lab.utils.default import PAT_STR, TOKENIZERS_FOLDER, DATA_DIR from gpt_lab.utils.logging import log0 +import math from pathlib import Path import argparse, pickle, zipfile import time @@ -76,6 +85,38 @@ def load_all_results(path): except EOFError: break return results + +def renyi_entropy(counter, alpha=2.5, eps=1e-12): + """ + Rényi entropy of order alpha as proposed by Zouhar et al. (2023) to measure the diversity of the corpus: + $$H_{\alpha}(X) = (1 / (1 - \alpha)) * \log( \sum_{x \in \mathcal{X}} p(x)^{\alpha})$$ + where p(x) is the probability of token x in the corpus. + - For $\alpha \to 0$, it corresponds to the logarithm of the support size (number of unique tokens). + - For $\alpha \to 1$, it corresponds to the Shannon entropy (the limit as $\alpha$approaches 1). + - For $\alpha \to 2$, it corresponds to the collision entropy, which is related to the probability that two randomly chosen tokens are the same. + - For $\alpha \to \infty$, it corresponds to the min-entropy, which is related to the probability of the most likely token. + """ + total = sum(counter.values()) + if total == 0: + return 0.0 + probabilities = [count / total for count in counter.values()] + if alpha == 1: + return -sum(p * math.log(p) for p in probabilities) + else: + return (1 / (1 - alpha)) * math.log(sum(p ** alpha for p in probabilities)) + +def entropy_efficiency(counter, alpha=2.5, eps=1e-12): + """ + Efficiency of the tokenizer as proposed by Zouhar et al. 
(2023), defined as the ratio of the Rényi entropy of the token distribution to the logarithm of the vocabulary size: + $$\text{Efficiency} = \frac{H_{\alpha}(X)}{\log(|V|)}$$ + where $H_{\alpha}(X)$ is the Rényi entropy of order $\alpha$ and $|V|$ is the vocabulary size. + This metric captures how well the tokenizer utilizes its vocabulary to represent the diversity of the corpus. A higher efficiency indicates that the tokenizer is effectively using its vocabulary to capture the variability in the data, while a lower efficiency may suggest that many tokens are underutilized or that the tokenizer is not capturing enough diversity. + """ + vocab_size = len(counter) + if vocab_size == 0: + return 0.0 + renyi_ent = renyi_entropy(counter, alpha=alpha, eps=eps) + return renyi_ent / math.log(vocab_size) def enwik8_path(): base_dir = DATA_DIR / "corpus/eval_enwik8" @@ -193,6 +234,10 @@ def eval_tokenizer(tokenizer): metrics[key].append(value) t1 = time.time() res = {key: sum(values) / len(values) for key, values in metrics.items()} + # both are useless actually as we store the counter and can compute any metric we want from it, + # but let's keep them for now as they are easy to compute and can be a quick proxy + res["renyi_entropy"] = renyi_entropy(counter) + res["entropy_efficiency"] = entropy_efficiency(counter) res["nb_tokens"] = len_tokens res["nb_chars"] = len_chars res["nb_bytes"] = len_bytes @@ -290,12 +335,15 @@ def run_tokenizer_experiment(task): def main(): parser = argparse.ArgumentParser(description="Find the optimal corpus size for training a BPE tokenizer with different vocabulary sizes, and evaluate the trained tokenizers on a simple test set to analyze the trade-offs between corpus size, vocabulary size, training time, and tokenization quality.") - parser.add_argument("--write-corpus", action="store_true", help="Flag to indicate training mode (write corpus). 
If not set, the script will attempt to load an existing corpus from disk.") - parser.add_argument("--seed", type=int, default=42) + # General arguments + parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility. Default is 42.") + parser.add_argument("--results-path", type=str, default=str(TOKENIZERS_FOLDER / 'scaling_tokenizer_results.pkl'), help="Path to store the results of the tokenizer evaluations. Default to './.gpt_lab/tokenizers/scaling_tokenizer_results.pkl'. If a file already exists at this path, it will be renamed with a number suffix to avoid overwriting previous results.") + # Tokenizers configuration arguments parser.add_argument("--vocab-sizes", type=str, default="50000,70000,100000,200000", help="Comma-separated list of vocabulary sizes to train tokenizers with.") parser.add_argument("--pat-strs", type=str, default=None, help="Comma-separated list of pattern string names to use for tokenizer training. If not specified, defaults to using the GPT-2 pattern string.") + # Corpus configuration arguments + parser.add_argument("--write-corpus", action="store_true", help="Flag to indicate training mode (write corpus). If not set, the script will attempt to load an existing corpus from disk.") parser.add_argument("--corpus-sizes-mb", type=str, default=None, help="Comma-separated list of corpus sizes in megabytes to use for tokenizer training. If not specified, defaults to a range of sizes based on the vocabulary size.") - parser.add_argument("--results-path", type=str, default=str(TOKENIZERS_FOLDER / 'scaling_tokenizer_results.pkl'), help="Path to store the results of the tokenizer evaluations. Default to './.gpt_lab/tokenizers/scaling_tokenizer_results.pkl'. 
If a file already exists at this path, it will be renamed with a number suffix to avoid overwriting previous results.") parser.add_argument("--compare-truncated-baselines", action="store_true", help="Whether to compare trained tokenizers with truncated versions of baseline tokenizers.") parser.add_argument("--corpus-temperature-alpha", type=float, default=None, help="Optional temperature parameter to control the randomness of the corpus generation. Higher values will result in a more diverse corpus, while lower values will make it more focused on the most common samples. This can be useful for testing how the tokenizer performs with different levels of corpus diversity.") args = parser.parse_args() @@ -308,7 +356,6 @@ def main(): results_path = Path(args.results_path) results_path.parent.mkdir(parents=True, exist_ok=True) - if results_path.exists(): backup_path = results_path i = 1 @@ -324,7 +371,6 @@ def main(): results_path.rename(backup_path) log0(f"Existing results file found. Renamed to {backup_path!r} to avoid overwriting. 
New results will be stored in {results_path!r}.", logger=logger) - def store_results(results_batch, path=results_path): with open(path, "ab") as f: pickle.dump(results_batch, f) @@ -341,11 +387,6 @@ def store_results(results_batch, path=results_path): # pickle.dump(results, f) # Initiate test set and evaluation functions - # testing_sets = ["HuggingFaceFW/fineweb-edu", "HuggingFaceTB/finemath", "codeparrot/codeparrot-clean", ] - # "HuggingFaceFW/fineweb-2" "subset=fra_Latn,jpn_Jpan,kor_Hang,arb_Arab" - - - # Baselines: gpt2, cl100k_base, o200k_base from tiktoken import get_encoding results = load_all_results(results_path) if results_path.exists() else [] @@ -370,13 +411,13 @@ def store_results(results_batch, path=results_path): # Corpus size varying with different vocab_sizes and split patterns # patterns = { "pat_str-gpt2": PAT_STR_GPT2, "pat_str-gpt4": PAT_STR_GPT4, "pat_str-punct": PAT_STR_punct, "pat_str-cl100k_base": PAT_STR_cl100k_base, "pat_str-o200k_base": PAT_STR_o200k_base } - patterns = { "pat_str-gpt2": PAT_STR_GPT2 } + patterns = { "pat_str-gpt2": PAT_STR["gpt2"] } # patterns = { "PAT_STR_o200k_base": PAT_STR_o200k_base } # TODO: optimize by running the biggest vocab size and slice it on top-k merges for smaller vocabs # vocab_sizes = [10_000, 20_000, 30_000, 50_000, 100_000, 200_000, 300_000, 500_000] vocab_sizes = [int(v) for v in args.vocab_sizes.split(",")] if args.vocab_sizes else [50_000, 70_000, 100_000, 200_000] + - # vocab_sizes = list(reversed(vocab_sizes)) _max_char_runs = 16 # adjust the divisor to control how many runs are done before storing results to disk, this is a trade-off between memory usage and frequency of saving intermediate results. With 3 processes, we can afford to do more runs before saving, but if you have more memory constraints, you might want to save more frequently by using a smaller divisor. 
max_bytes = lambda vocab_size: [int(vocab_size * i * 1024) for i in range(1, _max_char_runs+1, 2)] # ~3.5 characters per token on average, adjust as needed based on your corpus # Two options: same name for all tokenizers -> overwrite / different names -> many tokenizers on disk, consider cleaning up after training or implementing a caching mechanism to avoid retraining the same tokenizer multiple times. From 0b7ee67f4d8808ebdf209b21b83ce8119937260e Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Thu, 21 May 2026 16:46:33 +0200 Subject: [PATCH 14/18] tokenizer: split tech report from script+impl args and improve storage for reproducity --- docs/tokenizer_scaling.md | 134 +++++ scripts/benchmark/tokenizer_corpus_size.py | 640 ++++++++++++--------- 2 files changed, 487 insertions(+), 287 deletions(-) create mode 100644 docs/tokenizer_scaling.md diff --git a/docs/tokenizer_scaling.md b/docs/tokenizer_scaling.md new file mode 100644 index 0000000..f7a2649 --- /dev/null +++ b/docs/tokenizer_scaling.md @@ -0,0 +1,134 @@ +# ByteLevel BPE Scaling Experiments + +This document describes the experiments conducted to analyze how ByteLevel BPE tokenization scales with different corpus sizes, vocabulary sizes, and split patterns. The goal is to understand the trade-offs between these factors and their impact on tokenization quality and efficiency. + +However, the results obtained were quite poor compared to the baselines, given that I could not reach the optimal memory budget for training the tokenizers. + +To run the experiments, we run the following command from the root directory of the repo: +```bash +uv run python -m scripts.benchmark.tokenizer_corpus_size \ + --seed 42 \ # for reproducibility + --num-procs 16 \ + --vocab-sizes 20000,50000,100000 \ + --pat-strs gpt2,cl100k_base \ + --write-corpus \ + --corpus-sizes-mb 10,50,100,500,1000,5000,10000 +``` + +Args: +- `--seed`: Random seed for reproducibility. Default is 42. 
+- `--num-procs`: Number of processes to use for tokenizer training. Defaults to the number of CPU cores available, capped at 32 to avoid overloading the system. +- `--vocab-sizes`: Comma-separated list of vocabulary sizes to train tokenizers with. +- `--pat-strs`: Comma-separated list of pattern string names to use for tokenizer training. If not specified, defaults to using the GPT-2 pattern string. +- `--write-corpus`: Flag to indicate training mode (write corpus). If not set, the script will attempt to load an existing corpus from disk. +- `--corpus-sizes-mb`: Comma-separated list of corpus sizes in megabytes to use for tokenizer training. If not specified, defaults to a range of sizes based on the vocabulary size. +- `--compare-truncated-baselines`: Whether to compare trained tokenizers with truncated versions of baseline tokenizers. +- `--corpus-temperature-alpha`: Optional temperature parameter to control the randomness of the corpus generation. + +It allows us to train tokenizers with different configurations and evaluate them on a simple test set. The script is designed to be flexible and customizable, allowing us to easily add new sources for training data, new evaluation datasets, and new metrics for evaluating the tokenizers. + +## Summary + +Full recipe for training and scaling tokenizer with different corpus sizes, vocabulary sizes, patterns, +and evaluating the trained tokenizers on a simple test set to analyze the trade-offs between: +- training corpus size, +- vocabulary size, +- split pattern, +- tokenization quality (compression ratio, efficiency, etc.) +- and cross-language generalization (if we have multilingual evaluation sets). + +The training data is generated by sampling from a mixture of sources, which can be customized by the user. 
+ +By default, the training data is sampled from a mixture of the following sources ([gpt_lab.tokenizer.corpus](./../src/gpt_lab/tokenizer/corpus.py#L339)): +- English web text (from fineweb-edu) +- Multilingual web text (from fineweb-2) +- Maths text (from finemath-4plus) +- Python code (from codeparrot-clean) +To keep the same logic between the different runs, we create the training data once, store it on disk, and then use it for all the different runs. +This also allows us to analyze the impact of the corpus size without having to worry about the randomness of the data generation process. + +The evaluation is done on the following datasets (eval_configs in the code): +- enwik8 (for English text) +- fineweb-edu (for English web text) +- fineweb-2 (for multilingual web text, with subsets for different languages) +- finemath-4plus (for math text) +- github-top-code (for Python code) + +> [!WARNING] +> With its current implementation, the script may use the same samples for both training and evaluation, +which can lead to overfitting and an overestimation of the tokenizer's performance. +> However, the results obtained were quite poor compared to the baselines, given that I could not reach +the optimal memory budget for training the tokenizers. +> Hence, in case of future runs with **exceptionally good results**, it would be important to check whether +the training and evaluation samples are overlapping, and if so, to implement a proper train/eval +split to get a more accurate estimate of the tokenizer's performance. + +> [!NOTE] +> The code is designed to be flexible and customizable, allowing you to easily add new sources for +training data, new evaluation datasets, and new metrics for evaluating the tokenizers. 
+ + +A similar study on the optimal corpus size for training a BPE tokenizer is: +- [1] in which they find that the returns diminish after 150GB of training data, for BPE tokenizers with 40,960, 64,000, 128,000, and 256,000 vocabulary sizes. + +However, this study is focused on training a BPE tokenizer with a specific size. They conclude that over 150GB of training data, the performance improvements become marginal, +suggesting that there is an optimal corpus size for training BPE tokenizers. + +Here, we want to analyze the trade-offs between corpus size, vocabulary size, and split pattern. +Then, we also compare with truncated versions of baseline tokenizers to see how much of the performance can be retained with a smaller vocabulary size. + +This is mainly motivated by the following facts: +- Language models have been scaled up but tokenizer sizes have not been scaled up as much, and it is not clear how much the tokenizer performance can be improved by scaling up the tokenizer training corpus and vocabulary size. +- According to [3], language model performance is sensitive to tokenizer size, and the optimal size is often larger than the commonly used 50-100k tokens, especially for larger models. + +## Usage + +How to run it from root directory of the repo: + +options: + -h, --help show the help message and exit + --seed SEED + Random seed for reproducibility. Default is 42. + --num-procs NUM_PROCS + Number of processes to use for tokenizer training. Defaults to the number of CPU cores available, capped at 32 to avoid overloading the system. + --vocab-sizes VOCAB_SIZES + Comma-separated list of vocabulary sizes to train tokenizers with. + --pat-strs PAT_STRS + Comma-separated list of pattern string names to use for tokenizer training. If not specified, defaults to using the GPT-2 pattern string. + --write-corpus + Flag to indicate training mode (write corpus). If not set, the script will attempt to load an existing corpus from disk. 
+ --corpus-sizes-mb CORPUS_SIZES_MB + Comma-separated list of corpus sizes in megabytes to use for tokenizer training. If not specified, defaults to a range of sizes based on the vocabulary size. + --compare-truncated-baselines + Whether to compare trained tokenizers with truncated versions of baseline tokenizers. + --corpus-temperature-alpha CORPUS_TEMPERATURE_ALPHA + Optional temperature parameter to control the randomness of the corpus generation. Higher values will result in a more diverse corpus, while lower values will make it + more focused on the most common samples. This can be useful for testing how the tokenizer performs with different levels of corpus diversity. + --resume + Whether to resume from existing results file. If set, the script will attempt to load existing results from the specified results path and continue from there, + skipping any experiments that have already been completed. This can be useful for long-running experiments that may be interrupted or for iteratively adding new + configurations without re-running everything. + --results-path RESULTS_PATH + Path to store the results of the tokenizer evaluations. Default to './.gpt_lab/tokenizers/scaling_tokenizer_results.pkl'. If a file already exists at this path, it + will be renamed with a number suffix to avoid overwriting previous results. + +- Make a new scaling run with new corpus sizes: + +## Acknowledgements: + +This experiment is inspired by and has some code adapted from the following sources: +- The Hugging Face Tokenizers library (https://github.com/huggingface/tokenizers) +- The OpenAI tiktoken library (https://github.com/openai/tiktoken) +- nanochat tokenizer code (https://github.com/karpathy/nanochat) for the idea of using HF-training backend + tiktoken-inference backend for efficient training and evaluation of tokenizers. + +## References: +1. Reddy, Varshini, et al. "How much is enough? the diminishing returns of tokenization training data." 
arXiv preprint arXiv:2502.20273 (2025). +2. Zouhar, Vilém, et al. "Tokenization and the noiseless channel." Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2023. +3. Tao, Chaofan, et al. "Scaling laws with vocabulary: Larger models deserve larger vocabularies." Advances in Neural Information Processing Systems 37 (2024): 114147-114179. +4. Karpathy, Andrej. "Let’s Build the GPT Tokenizer: A Complete Guide to Tokenization in LLMs. A text and code version of Karpathy’s famous tokenizer video." https://www.fast.ai/posts/2025-10-16-karpathy-tokenizers.html (2025). + +## Contributing: +- If you want to contribute to this project, please feel free to open an issue or a pull request. Any contributions are welcome, whether it's fixing a bug, adding a new feature, or improving the documentation. + +Author: Arthur Testard (arthur.testard.pro@gmail.com) \ +Please cite this work if the code is helpful to you. \ No newline at end of file diff --git a/scripts/benchmark/tokenizer_corpus_size.py b/scripts/benchmark/tokenizer_corpus_size.py index 0a3dd05..9ef81d4 100644 --- a/scripts/benchmark/tokenizer_corpus_size.py +++ b/scripts/benchmark/tokenizer_corpus_size.py @@ -1,7 +1,8 @@ """ -# How ByteLevel BPE Tokenization Scales? +# ByteLevel BPE Scaling Experiments ## Summary + Full recipe for training and scaling tokenizer with different corpus sizes, vocabulary sizes, patterns, and evaluating the trained tokenizers on a simple test set to analyze the trade-offs between: - training corpus size, @@ -10,28 +11,69 @@ - tokenization quality (compression ration, efficiency, etc.) - and cross-language generalization (if we have multilingual evaluation sets). -There is similar study on studying the optimal corpus size for training a BPE tokenizer as: -- [1] in which they find that the returns diminish after 150GB of training data, for BPE tokenizers with 40,960, 64,000, 128,000, and 256,000 vocabulary sizes. 
- -However, this study is focused on training a BPE tokenizer with a specific size. They conclude that over 150GB a tokenizer with - -Here, we want to analyze the trade-offs -between corpus size, vocabulary size, and tokenization quality, and also compare with truncated versions of baseline -tokenizers to see how much of the performance can be retained with a smaller vocabulary size. - -This is mainly motivated by the following facts: -- Language model have been scaled up but tokenizers sizes have not been scaled up as much, and it is not clear how much the tokenizer performance can be improved by scaling up the tokenizer training corpus and vocabulary size. -- According to [2], Language model performance is sensitive to tokenizer size, and the optimal size is often larger than the commonly used 50k tokens, especially for larger models and more diverse corpora. +The training data is generated by sampling from a mixture of sources, which can be customized by the user. + +By default, the training data is sampled from a mixture of the following sources ([gpt_lab.tokenizer.corpus](./../src/gpt_lab/tokenizer/corpus.py)): +- English web text (from fineweb-edu) +- Multilingual web text (from fineweb-2) +- Maths text (from finemath-4plus) +- Python code (from codeparrot-clean) + +To keep the same logic between the different runs, we create the training data once, store it on disk, and then use it for all the different runs. +This also allows us to analyze the impact of the corpus size without having to worry about the randomness of the data generation process. 
+ +The evaluation is done on the following datasets (eval_configs in the code): +- enwik8 (for English text) +- fineweb-edu (for English web text) +- fineweb-2 (for multilingual web text, with subsets for different languages) +- finemath-4plus (for math text) +- github-top-code (for Python code) + +> [!WARNING] +> With it current implementation, the script may use the same samples for both training and evaluation, +which can lead to overfitting and an overestimation of the tokenizer's performance. +> However, the results obtained were quite poored compared to the baselines, given that I could not reach +the optimal memory budget for training the tokenizers. +> Hence, in case of future runs with **exceptionally good results**, it would be important to check whether +the training and evaluation samples are overlapping, and if so, to implement a proper train/eval +split to get a more accurate estimate of the tokenizer's performance. ## Usage How to run it from root directory of the repo: -- Make a new scaling run with new corpus sizes: - +```bash +uv run python -m scripts.benchmark.tokenizer_corpus_size +``` + +options: + -h, --help show the help message and exit + --seed SEED + Random seed for reproducibility. Default is 42. + --num-procs NUM_PROCS + Number of processes to use for tokenizer training. Defaults to the number of CPU cores available, capped at 32 to avoid overloading the system. + --vocab-sizes VOCAB_SIZES + Comma-separated list of vocabulary sizes to train tokenizers with. + --pat-strs PAT_STRS + Comma-separated list of pattern string names to use for tokenizer training. If not specified, defaults to using the GPT-2 pattern string. + --write-corpus + Flag to indicate training mode (write corpus). If not set, the script will attempt to load an existing corpus from disk. + --corpus-sizes-mb CORPUS_SIZES_MB + Comma-separated list of corpus sizes in megabytes to use for tokenizer training. 
If not specified, defaults to a range of sizes based on the vocabulary size. + --compare-truncated-baselines + Whether to compare trained tokenizers with truncated versions of baseline tokenizers. + --corpus-temperature-alpha CORPUS_TEMPERATURE_ALPHA + Optional temperature parameter to control the randomness of the corpus generation. Higher values will result in a more diverse corpus, while lower values will make it + more focused on the most common samples. This can be useful for testing how the tokenizer performs with different levels of corpus diversity. + --resume + Whether to resume from existing results file. If set, the script will attempt to load existing results from the specified results path and continue from there, + skipping any experiments that have already been completed. This can be useful for long-running experiments that may be interrupted or for iteratively adding new + configurations without re-running everything. + --results-path RESULTS_PATH + Path to store the results of the tokenizer evaluations. Default to './.gpt_lab/tokenizers/scaling_tokenizer_results.pkl'. If a file already exists at this path, it + will be renamed with a number suffix to avoid overwriting previous results. -[!NOTE] -Recommended: run with `--optim-config-path=configs/optim.yaml` argument. +- Make a new scaling run with new corpus sizes: ## Aknowledgements: This code is inspired by and has some code adapted from the following sources: @@ -42,10 +84,14 @@ ## References: 1. Reddy, Varshini, et al. "How much is enough? the diminishing returns of tokenization training data." arXiv preprint arXiv:2502.20273 (2025). 2. Zouhar, Vilém, et al. "Tokenization and the noiseless channel." Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2023. +3. Tao, Chaofan, et al. "Scaling laws with vocabulary: Larger models deserve larger vocabularies." Advances in Neural Information Processing Systems 37 (2024): 114147-114179. +4. 
Karpathy, Andrej. "Let’s Build the GPT Tokenizer: A Complete Guide to Tokenization in LLMs. A text and code version of Karpathy’s famous tokenizer video." https://www.fast.ai/posts/2025-10-16-karpathy-tokenizers.html (2025). Author: Arthur Testard (arthur.testard.pro@gmail.com) Please cite this work if the code is helpful to you. """ +from __future__ import annotations + if __name__ == "__main__": from gpt_lab.utils.logging import init_logger init_logger() @@ -58,33 +104,219 @@ from gpt_lab.utils.default import PAT_STR, TOKENIZERS_FOLDER, DATA_DIR from gpt_lab.utils.logging import log0 +import os import math +import json from pathlib import Path import argparse, pickle, zipfile import time from tqdm import tqdm from collections import Counter +from typing import List, Optional + import regex as re import logging logger = logging.getLogger(__name__) -BASELINES = ["gpt2", "cl100k_base", "o200k_base"] +_BASELINES = ["gpt2", "cl100k_base", "o200k_base"] + +byte_per_doc = lambda max_byte: max_byte // 10_000 # Default to 1000 documents if not specified, adjust as needed + +def main(): + parser = argparse.ArgumentParser(description="Find the optimal corpus size for training a BPE tokenizer with different vocabulary sizes, and evaluate the trained tokenizers on a simple test set to analyze the trade-offs between corpus size, vocabulary size, training time, and tokenization quality.") + # General arguments + parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility. Default is 42.") + parser.add_argument("--num-procs", type=int, default=None, help="Number of processes to use for tokenizer training. Defaults to the number of CPU cores available, capped at 32 to avoid overloading the system.") + # Tokenizers configuration arguments + parser.add_argument("--baselines", type=str, default=",".join(_BASELINES), help=f"Comma-separated list of baseline tokenizers to compare with. 
Default is {','.join(_BASELINES)}.") + parser.add_argument("--vocab-sizes", type=str, default="50000,70000,100000,200000", help="Comma-separated list of vocabulary sizes to train tokenizers with.") + parser.add_argument("--pat-strs", type=str, default=None, help="Comma-separated list of pattern string names to use for tokenizer training. If not specified, defaults to using the GPT-2 pattern string.") + # Corpus configuration arguments + parser.add_argument("--write-corpus", action="store_true", help="Flag to indicate training mode (write corpus). If not set, the script will attempt to load an existing corpus from disk.") + parser.add_argument("--corpus-dir", type=str, default=None, help="Path to the directory to store the generated corpus. If not specified, defaults to './.gpt_lab/corpus/scaling_corpus'.") + parser.add_argument("--corpus-sizes-gb", type=str, default=None, help="Comma-separated list of corpus sizes in gigabytes to use for tokenizer training. If not specified, defaults to a range of sizes based on the vocabulary size.") + parser.add_argument("--compare-truncated-baselines", action="store_true", help="Whether to compare trained tokenizers with truncated versions of baseline tokenizers.") + parser.add_argument("--corpus-temperature-alpha", type=float, default=None, help="Optional temperature parameter to control the randomness of the corpus generation. Higher values will result in a more diverse corpus, while lower values will make it more focused on the most common samples. This can be useful for testing how the tokenizer performs with different levels of corpus diversity.") + # Resuming and caching arguments + parser.add_argument("--resume", action="store_true", help="Whether to resume from existing results file. If set, the script will attempt to load existing results from the specified results path and continue from there, skipping any experiments that have already been completed. 
This can be useful for long-running experiments that may be interrupted or for iteratively adding new configurations without re-running everything.") + parser.add_argument("--result-dir", type=str, default=str(TOKENIZERS_FOLDER / 'scaling_tokenizer_results'), help="Path to the directory to store the results of the tokenizer evaluations. Default to './.gpt_lab/tokenizers/scaling_tokenizer_results'. If a file already exists at this path, it will be renamed with a number suffix to avoid overwriting previous results.") + parser.add_argument("--save-every", type=int, default=10, help="Number of runs to execute before saving intermediate results to disk. This can help prevent data loss in case of interruptions and allows for monitoring progress during long experiments. Default is 10.") + # Monitoring + parser.add_argument("--board", default="dummy", help="Whether to use Board module for monitoring. Options are 'tensorboard', 'wandb', or 'dummy' (no monitoring). Default is 'dummy'.") + + args = parser.parse_args() + + # initiate results storage + # create results path if it doesn't exist + # and backup existing file if it does to avoid overwriting/mixing previous results + result_dir = Path(args.result_dir) + result_dir.parent.mkdir(parents=True, exist_ok=True) + if result_dir.exists() and not args.resume: + backup_dir = result_dir + i = 1 + while backup_dir.exists(): + with open(backup_dir, "rb") as f: + results = pickle.load(f) + if results == [] or results is None: + log0(f"Existing results file {backup_dir} is empty. It will be overwritten with new results.", logger=logger) + break + new_name = result_dir.stem + f"_{i}" + backup_dir = backup_dir.with_stem(new_name) + i += 1 + result_dir.rename(backup_dir) + log0(f"Existing results file found. 
New results will be stored in {result_dir!r} to avoid overwriting.", logger=logger) + + exp_name = result_dir.stem + + # Store experiment metadata for reproducibility and analysis + meta_path = (result_dir / "meta").with_suffix(".json") + if meta_path.exists(): + with open(meta_path, "r") as f: + meta = json.load(f) + else: + meta = dict( + seed=args.seed, + baselines=args.baselines, + num_procs=args.num_procs, + vocab_sizes=args.vocab_sizes, + pat_strs=args.pat_strs, + write_corpus=args.write_corpus, + corpus_sizes_gb=args.corpus_sizes_gb, + compare_truncated_baselines=args.compare_truncated_baselines, + corpus_temperature_alpha=args.corpus_temperature_alpha, + corpus_dir=args.corpus_dir, + ) + with open(meta_path, "w") as f: + json.dump(meta, f, indent=4) + + # Solve args and configurations + num_procs = min(os.cpu_count(), 32) if meta.get("num_procs") is None else meta["num_procs"] + + vocab_sizes = [int(v) for v in meta.get("vocab_sizes", "50_000, 70_000, 100_000, 200_000").split(",")] + patterns = { f"pat_str-{s.strip()}": PAT_STR.get(s.strip()) for s in meta.get("pat_strs", "gpt2").split(",") if s.strip() in PAT_STR } if meta.get("pat_strs") else {"pat_str-gpt2": PAT_STR["gpt2"]} + corpus_sizes = [float(s) * 1024 * 1024 * 1024 for s in meta.get("corpus_sizes_gb", None).split(",")] if meta.get("corpus_sizes_gb") else None + baselines = [ b.strip() for b in meta.get("baselines", ",".join(_BASELINES)).split(",") if b.strip() in _BASELINES ] if meta.get("baselines") else _BASELINES + + from tiktoken import get_encoding + + result_paths = load_all_result_paths(result_dir) if result_dir.exists() else [] + + for baseline in baselines: + if baseline not in result_paths: + enc = get_encoding(baseline) + evaluation = eval_tokenizer(enc) + result = dict( + vocab_size=enc.n_vocab, + pattern=baseline, + max_chars=None, + config=None, + training_time=None, + corpus_size_mb=None, + evaluation=evaluation, + baseline=baseline, + ) + store_single_result(result, result_dir / 
f"{baseline}.pkl") + + # TODO: add truncated versions of the baselines as well + + tokenizer_name = lambda vocab_size, max_char, p_str_name: f"{exp_name}-{int(vocab_size//1000)}k_maxchar-{max_char//1e6:.1f}M_pattern-{p_str_name}" + tokenizer_path = lambda vocab_size, max_char, p_str_name: result_dir / f"{tokenizer_name(vocab_size, max_char, p_str_name)}.pkl" + + log0(f"Using {num_procs} processes for tokenizer training.") + corpus_path = DATA_DIR / "corpus" / exp_name + corpus_bytemax = max(corpus_sizes) + + if args.write_corpus: + log0(f"Writing corpus to {corpus_path} with max bytes {corpus_bytemax:,}...", logger=logger) + corpus = TokenizerCorpus.write_from_sources( + corpus_dir=corpus_path, + max_bytes=corpus_bytemax, + bytes_per_doc=byte_per_doc(corpus_bytemax), + random_seed=meta.get("seed"), + temperature_alpha=meta.get("corpus_temperature_alpha"), + ) + log0(f"Corpus written to {corpus_path}. Size: {sum(c.stat().st_size / 1e6 for c in corpus_path.glob('*.txt')):.2f} MB", logger=logger) + + if not args.write_corpus: + log0(f"Using existing corpus at {corpus_path} with max bytes {corpus_bytemax:,} for tokenizer training.", logger=logger) + if not corpus_path.exists(): + raise FileNotFoundError(f"Corpus path {corpus_path} does not exist. 
Please run the script with --write-corpus flag to create the corpus before training tokenizers.") + corpus = TokenizerCorpus.from_sources(corpus_dir=corpus_path) + corpus.show_stats() + + # selecting runs to execute based on the existing results and the configuration + tasks = [] + for vocab_size in vocab_sizes: + for p_str_name, p_str in patterns.items(): + for max_byte in corpus_sizes: + name = tokenizer_name(vocab_size, max_byte, p_str_name) + if name in result_paths: + log0(f"Skipping already completed run for vocab_size={vocab_size}, pattern={p_str_name}, max_byte={max_byte}.", logger=logger) + continue + + tasks.append( + ( + name, + vocab_size, + p_str_name, + p_str, + max_byte, + corpus_path, + corpus_bytemax, + meta, + ) + ) -# easy -def print(*args, **kwargs): - log0(" ".join(str(arg) for arg in args), **kwargs, logger=logger) + import multiprocessing as mp + from concurrent.futures import ProcessPoolExecutor, as_completed -def load_all_results(path): + mp.set_start_method("spawn", force=True) results = [] - with open(path, "rb") as f: - while True: + + t_total_start = time.time() + with ProcessPoolExecutor(max_workers=meta.get("num_procs")) as executor: + futures = [executor.submit(run_tokenizer_experiment, t) for t in tasks] + + buffer = [] + for i, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Tokenizer experiments")): + result = future.result() + _ = None # placeholder for run name or other metadata if needed + buffer.append((result, _)) + + if len(buffer) >= args.save_every: + store_buffered_results(buffer, result_dir) + + if buffer: + store_buffered_results(buffer, result_dir) + total_time = time.time() - t_total_start + + log0(f"Total time for all runs: {total_time/3600:.2f} hours.", logger=logger) + log0(f"All runs completed. 
Results stored in {result_dir}.", logger=logger) + +def load_all_result_paths(result_dir: Path) -> List[str]: + result_paths = [] + for path in result_dir.glob("*.pkl"): + with open(path, "rb") as f: try: - results.extend(pickle.load(f)) + # check if the file is empty or corrupted + result = pickle.load(f) + if result is not None: + result_paths.append(path.stem) except EOFError: - break - return results + continue + return result_paths + +def store_single_result(result, path): + with open(path, "wb") as f: + pickle.dump(result, f) + +def store_buffered_results(buffer, result_dir): + for result, run_name in buffer: + path = result_dir / run_name + store_single_result(result, path) + buffer.clear() def renyi_entropy(counter, alpha=2.5, eps=1e-12): """ @@ -118,74 +350,6 @@ def entropy_efficiency(counter, alpha=2.5, eps=1e-12): renyi_ent = renyi_entropy(counter, alpha=alpha, eps=eps) return renyi_ent / math.log(vocab_size) -def enwik8_path(): - base_dir = DATA_DIR / "corpus/eval_enwik8" - base_dir.mkdir(parents=True, exist_ok=True) - # download and unzip enwik8 to cache directory - enwik8_url = "https://mattmahoney.net/dc/enwik8.zip" - enwik8_local_path = base_dir.joinpath("enwik8") - enwik8_local_path_zip = base_dir.joinpath("enwik8.zip") - if not enwik8_local_path.exists(): - print(f"Downloading enwik8 to {enwik8_local_path_zip}") - import requests - response = requests.get(enwik8_url) - with open(enwik8_local_path_zip, "wb") as f: - f.write(response.content) - with zipfile.ZipFile(enwik8_local_path_zip, "r") as zip_ref: - zip_ref.extractall(base_dir) - print(f"Unzipped enwik8 to {enwik8_local_path}") - enwik8_local_path_zip.unlink() - print(f"Removed {enwik8_local_path_zip}") - else: - print(f"Using existing enwik8 at {enwik8_local_path}") - return enwik8_local_path - -enwik8_path = enwik8_path() - -def enwik8_loader(): - with open(enwik8_path, "r", encoding="utf-8") as f: - return f.read(10**7).split("\n") - -eval_configs = { - "enwik8": dict( - 
loader_fn=enwik8_loader, - ), - "HuggingFaceFW/fineweb-edu": dict( - split="train" # no test or validation split available - ), - "HuggingFaceTB/finemath": dict( - split="train", - name=["finemath-3plus"] - ), - "ronantakizawa/github-top-code": dict( - filter_fn=lambda x: x["file_language"] == "Python" # filter for python files only - ), - "HuggingFaceFW/fineweb-2": dict( - name=["fra_Latn", "jpn_Jpan", "kor_Hang", "arb_Arab"], - ) -} -eval_sets = [] -# prepare config to match the expected input of TokenizerCorpus.from_sources -for ds_name, ds_config in eval_configs.items(): - _ds = dict(name=ds_name) - _ds["split"] = ds_config.get("split", "test") - _ds["loader_fn"] = ds_config.get("loader_fn", None) - _ds["generator_source"] = dict(path=ds_name, weight=1.0) - if "filter_fn" in ds_config: - _ds["generator_source"]["filter_fn"] = ds_config["filter_fn"] - if ds_config.get("name", []) == []: - _ds["localdir"] = DATA_DIR / f"corpus/eval_{ds_name.replace('/', '_')}" - _ds["metricname"] = f"{ds_name.split('/')[-1]}" - eval_sets.append(_ds) - else: - for name in ds_config["name"]: - _ds = _ds.copy() - _ds["generator_source"] = _ds["generator_source"].copy() # have to copy to avoid mutating the original for the next iteration - _ds["subset"] = name - _ds["localdir"] = DATA_DIR / f"corpus/eval_{ds_name.replace('/', '_')}:{name}" - _ds["generator_source"]["name"] = name - _ds["metricname"] = f"{ds_name.split('/')[-1]}:{name}" - eval_sets.append(_ds) def get_eval_corpus(eval_set): return TokenizerCorpus.from_sources( @@ -202,9 +366,14 @@ def get_eval_corpus(eval_set): def eval_tokenizer(tokenizer): results = {} + _counter.update(tokens) + _token_len_cache = { + tok: len(tokenizer.decode([tok])) + for tok in range(tokenizer.n_vocab) + } for eval_set in eval_sets: metrics = dict() - counter = Counter() + _counter = Counter() len_tokens = 0 len_chars = 0 len_bytes = 0 @@ -214,19 +383,20 @@ def eval_tokenizer(tokenizer): if not text.strip(): continue tokens = 
tokenizer.encode(text, disallowed_special=()) - counter.update(tokens) len_tokens += len(tokens) len_chars += len(text) len_bytes += len(text.encode("utf-8")) decoded = tokenizer.decode(tokens) acc = decoded == text compression_ratio = len(tokens) / len(text) if len(text) > 0 else 0 + tokens_per_bytes = len(tokens) / len_bytes # maybe optimized - char_by_token = [len(tokenizer.decode([tok])) for tok in tokens] + char_by_token = [_token_len_cache[tok] for tok in tokens] char_by_token_avg = sum(char_by_token) / len(char_by_token) if len(char_by_token) > 0 else 0 for key, value in [ ("accuracy", acc), ("compression_ratio", compression_ratio), + ("tokens_per_bytes", tokens_per_bytes), ("nb_char_by_token_avg", char_by_token_avg) ]: if key not in metrics: @@ -236,12 +406,13 @@ def eval_tokenizer(tokenizer): res = {key: sum(values) / len(values) for key, values in metrics.items()} # both are useless actually as we store the counter and can compute any metric we want from it, # but let's keep them for now as they are easy to compute and can be a quick proxy - res["renyi_entropy"] = renyi_entropy(counter) - res["entropy_efficiency"] = entropy_efficiency(counter) + res["renyi_entropy"] = renyi_entropy(_counter) + res["entropy_efficiency"] = entropy_efficiency(_counter) res["nb_tokens"] = len_tokens res["nb_chars"] = len_chars res["nb_bytes"] = len_bytes - res["token_counter"] = counter + # NOTE: probably too heavy to store the counter for all the runs + # res["token_counter"] = counter res["eval_time"] = t1 - t0 results[eval_set["metricname"]] = res return results @@ -250,13 +421,13 @@ def compare_with_truncated_baselines(target_vocab_size): comparisons = {} from tiktoken import get_encoding - for baseline in BASELINES: + for baseline in _BASELINES: baseline_vocab_size = get_encoding(baseline).n_vocab if baseline_vocab_size <= target_vocab_size: continue truncated_name = f"{baseline}_truncated_{target_vocab_size}" - truncated_tokenizer = 
Tokenizer.from_pretrained(truncated_name) + truncated_tokenizer = Tokenizer.truncated_from_pretrained(baseline, target_vocab_size) comparisons[baseline] = { "base_vocab_size": baseline_vocab_size, "truncated_name": truncated_name, @@ -265,16 +436,21 @@ def compare_with_truncated_baselines(target_vocab_size): return comparisons + +# -------------------------------------------------------------------- +# Parallel execution of tokenizer training and evaluation +# -------------------------------------------------------------------- + def run_tokenizer_experiment(task): - ( + ( + name, vocab_size, p_str_name, p_str, max_bytes, corpus_path, corpus_bytemax, - seed, - name, + meta ) = task num_procs = 1 # IMPORTANT: avoid nested parallelism corpus = TokenizerCorpus.from_sources( @@ -282,7 +458,7 @@ def run_tokenizer_experiment(task): sources=None, max_bytes=corpus_bytemax, bytes_per_doc=corpus_bytemax // 20_000, - random_seed=seed, + random_seed=meta.get("seed"), ) trainer_config = TokenizerTrainerConfig( max_bytes=max_bytes, @@ -297,10 +473,10 @@ def run_tokenizer_experiment(task): name=name, vocab_size=vocab_size, pat_str=p_str, - trainer=trainer_config, source="huggingface", # this is quite dummy save_token_bytes=False, # we will compute token bytes on the fly without saving to disk to avoid IO overhead, adjust as needed based on your use case and whether you want to inspect the token bytes files # special_tokens=SpecialTokens(), # using default special tokens, adjust as needed + trainer=trainer_config, ) t0 = time.time() tokenizer = Tokenizer.train_from_iterator( @@ -316,7 +492,7 @@ def run_tokenizer_experiment(task): "tokenizer_name": name, "config": str(config), "training_time": t1 - t0, - "corpus_size_mb": corpus_path.stat().st_size / 1e6, + "corpus_size_gb": corpus_path.stat().st_size / 1e9, } for text in corpus.iterator(max_bytes=max_bytes): @@ -331,188 +507,78 @@ def run_tokenizer_experiment(task): del tokenizer return result -byte_per_doc = lambda max_byte: 
max_byte // 10_000 # Default to 1000 documents if not specified, adjust as needed - -def main(): - parser = argparse.ArgumentParser(description="Find the optimal corpus size for training a BPE tokenizer with different vocabulary sizes, and evaluate the trained tokenizers on a simple test set to analyze the trade-offs between corpus size, vocabulary size, training time, and tokenization quality.") - # General arguments - parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility. Default is 42.") - parser.add_argument("--results-path", type=str, default=str(TOKENIZERS_FOLDER / 'scaling_tokenizer_results.pkl'), help="Path to store the results of the tokenizer evaluations. Default to './.gpt_lab/tokenizers/scaling_tokenizer_results.pkl'. If a file already exists at this path, it will be renamed with a number suffix to avoid overwriting previous results.") - # Tokenizers configuration arguments - parser.add_argument("--vocab-sizes", type=str, default="50000,70000,100000,200000", help="Comma-separated list of vocabulary sizes to train tokenizers with.") - parser.add_argument("--pat-strs", type=str, default=None, help="Comma-separated list of pattern string names to use for tokenizer training. If not specified, defaults to using the GPT-2 pattern string.") - # Corpus configuration arguments - parser.add_argument("--write-corpus", action="store_true", help="Flag to indicate training mode (write corpus). If not set, the script will attempt to load an existing corpus from disk.") - parser.add_argument("--corpus-sizes-mb", type=str, default=None, help="Comma-separated list of corpus sizes in megabytes to use for tokenizer training. 
If not specified, defaults to a range of sizes based on the vocabulary size.") - parser.add_argument("--compare-truncated-baselines", action="store_true", help="Whether to compare trained tokenizers with truncated versions of baseline tokenizers.") - parser.add_argument("--corpus-temperature-alpha", type=float, default=None, help="Optional temperature parameter to control the randomness of the corpus generation. Higher values will result in a more diverse corpus, while lower values will make it more focused on the most common samples. This can be useful for testing how the tokenizer performs with different levels of corpus diversity.") - args = parser.parse_args() - import os - num_procs = min(os.cpu_count(), 32) - - # initiate results storage - # create results path if it doesn't exist - # and backup existing file if it does to avoid overwriting/mixing previous results - results_path = Path(args.results_path) - results_path.parent.mkdir(parents=True, exist_ok=True) - - if results_path.exists(): - backup_path = results_path - i = 1 - while backup_path.exists(): - with open(backup_path, "rb") as f: - results = pickle.load(f) - if results == [] or results is None: - log0(f"Existing results file {backup_path} is empty. It will be overwritten with new results.", logger=logger) - break - new_name = results_path.stem + f"_{i}" - backup_path = backup_path.with_stem(new_name) - i += 1 - results_path.rename(backup_path) - log0(f"Existing results file found. Renamed to {backup_path!r} to avoid overwriting. 
New results will be stored in {results_path!r}.", logger=logger) - - def store_results(results_batch, path=results_path): - with open(path, "ab") as f: - pickle.dump(results_batch, f) - - # try: - # with open(path, "rb") as f: - # results = pickle.load(f) - # except FileNotFoundError: - # results = [] - - # results.extend(results_batch) +# -------------------------------------------------------------------- +# Prepare evaluation sets configurations and loaders +# -------------------------------------------------------------------- - # with open(path, "wb") as f: - # pickle.dump(results, f) - # Initiate test set and evaluation functions - - from tiktoken import get_encoding - - results = load_all_results(results_path) if results_path.exists() else [] - if len(results) == 0: - results = [] - for baseline in BASELINES: - enc = get_encoding(baseline) - evaluation = eval_tokenizer(enc) - result = dict( - vocab_size=enc.n_vocab, - pattern=baseline, - max_chars=None, - config=None, - training_time=None, - corpus_size_mb=None, - evaluation=evaluation, - baseline=baseline, - ) - results.append(result) - store_results(results) - - - # Corpus size varying with different vocab_sizes and split patterns - # patterns = { "pat_str-gpt2": PAT_STR_GPT2, "pat_str-gpt4": PAT_STR_GPT4, "pat_str-punct": PAT_STR_punct, "pat_str-cl100k_base": PAT_STR_cl100k_base, "pat_str-o200k_base": PAT_STR_o200k_base } - patterns = { "pat_str-gpt2": PAT_STR["gpt2"] } - # patterns = { "PAT_STR_o200k_base": PAT_STR_o200k_base } - # TODO: optimize by running the biggest vocab size and slice it on top-k merges for smaller vocabs - # vocab_sizes = [10_000, 20_000, 30_000, 50_000, 100_000, 200_000, 300_000, 500_000] - vocab_sizes = [int(v) for v in args.vocab_sizes.split(",")] if args.vocab_sizes else [50_000, 70_000, 100_000, 200_000] - - - _max_char_runs = 16 # adjust the divisor to control how many runs are done before storing results to disk, this is a trade-off between memory usage and frequency of 
saving intermediate results. With 3 processes, we can afford to do more runs before saving, but if you have more memory constraints, you might want to save more frequently by using a smaller divisor. - max_bytes = lambda vocab_size: [int(vocab_size * i * 1024) for i in range(1, _max_char_runs+1, 2)] # ~3.5 characters per token on average, adjust as needed based on your corpus - # Two options: same name for all tokenizers -> overwrite / different names -> many tokenizers on disk, consider cleaning up after training or implementing a caching mechanism to avoid retraining the same tokenizer multiple times. - # name = lambda vocab_size, max_char, p_str_name: f"ic1-tok-{int(vocab_size//1000)}k_maxchar-{max_char//1e6:.1f}M_pattern-{p_str_name}" - log0(f"Using {num_procs} processes for tokenizer training.") - corpus_path = DATA_DIR / "corpus" / results_path.stem - results = [] - corpus_bytemax = max(max_bytes(max(vocab_sizes))) - - if args.write_corpus: - print(f"Writing corpus to {corpus_path} with max bytes {corpus_bytemax:,}...") - corpus = TokenizerCorpus.write_from_sources( - corpus_dir=corpus_path, - max_bytes=corpus_bytemax, - bytes_per_doc=byte_per_doc(corpus_bytemax), - random_seed=args.seed, - temperature_alpha=args.corpus_temperature_alpha, - ) - print(f"Corpus written to {corpus_path}. Size: {sum(c.stat().st_size / 1e6 for c in corpus_path.glob('*.txt')):.2f} MB") - # Prepare run configurations - if not args.write_corpus: - print(f"Using existing corpus at {corpus_path} with max bytes {corpus_bytemax:,} for tokenizer training.") - if not corpus_path.exists(): - raise FileNotFoundError(f"Corpus path {corpus_path} does not exist. 
Please run the script with --write-corpus flag to create the corpus before training tokenizers.") - corpus = TokenizerCorpus.from_sources(corpus_dir=corpus_path) - corpus.show_stats() - tasks = [] - - for vocab_size in vocab_sizes: - for p_str_name, p_str in patterns.items(): - for max_byte in max_bytes(vocab_size): - tasks.append( - ( - vocab_size, - p_str_name, - p_str, - max_byte, - corpus_path, - corpus_bytemax, - args.seed, - f"ic1-scaling-tok-{p_str_name}-v{vocab_size}-b{max_byte//1e6:.1f}M", - ) - ) - - t_total_start = time.time() - import multiprocessing as mp - from concurrent.futures import ProcessPoolExecutor, as_completed - - mp.set_start_method("spawn", force=True) - max_workers = min(os.cpu_count(), 4) # be conservative - results = [] - # tasks_chunks = [tasks[i:i + max_workers] for i in range(0, len(tasks), _max_char_runs)] - # for chunk in tqdm(tasks_chunks, desc="Processing task chunks"): - # with ProcessPoolExecutor(max_workers=max_workers) as executor: - # futures = [executor.submit(run_tokenizer_experiment, task) for task in chunk] - # for future in tqdm(as_completed(futures), total=len(futures), desc="Tokenizer experiments"): - # results.append(future.result()) - - # store_results(results) - # results = [] # Reset results list for next chunk - with ProcessPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(run_tokenizer_experiment, t) for t in tasks] - - buffer = [] - for i, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Tokenizer experiments")): - result = future.result() - buffer.append(result) - results.append(result) +def enwik8_path(): + base_dir = DATA_DIR / "corpus/eval_enwik8" + base_dir.mkdir(parents=True, exist_ok=True) + # download and unzip enwik8 to cache directory + enwik8_url = "https://mattmahoney.net/dc/enwik8.zip" + enwik8_local_path = base_dir.joinpath("enwik8") + enwik8_local_path_zip = base_dir.joinpath("enwik8.zip") + if not enwik8_local_path.exists(): + 
log0(f"Downloading enwik8 to {enwik8_local_path_zip}", logger=logger) + import requests + response = requests.get(enwik8_url) + with open(enwik8_local_path_zip, "wb") as f: + f.write(response.content) + with zipfile.ZipFile(enwik8_local_path_zip, "r") as zip_ref: + zip_ref.extractall(base_dir) + log0(f"Unzipped enwik8 to {enwik8_local_path}", logger=logger) + enwik8_local_path_zip.unlink() + log0(f"Removed {enwik8_local_path_zip}", logger=logger) + else: + log0(f"Using existing enwik8 at {enwik8_local_path}", logger=logger) + return enwik8_local_path - # if len(buffer) >= _max_char_runs: - store_results(buffer) - buffer.clear() +enwik8_path = enwik8_path() - if buffer: - store_results(buffer) +def enwik8_loader(): + with open(enwik8_path, "r", encoding="utf-8") as f: + return f.read(10**7).split("\n") - if args.compare_truncated_baselines: - comparison_records = [] - for entry in results: - if entry.get("baseline") is not None: - continue - target_vocab_size = entry["vocab_size"] - comparison_records.append({ - "comparison_for": entry.get("tokenizer_name"), - "vocab_size": target_vocab_size, - "pattern": entry.get("pattern"), - "max_chars": entry.get("max_chars"), - "max_bytes": entry.get("max_bytes"), - "truncated_baseline_evaluations": compare_with_truncated_baselines(target_vocab_size), - }) - if comparison_records: - store_results(comparison_records) - - print(f"Total time for all runs: {(time.time() - t_total_start)/3600:.2f} hours.") - print(f"All runs completed. 
Results stored in {results_path}.") +eval_configs = { + "enwik8": dict( + loader_fn=enwik8_loader, + ), + "HuggingFaceFW/fineweb-edu": dict( + split="train" # no test or validation split available + ), + "HuggingFaceTB/finemath": dict( + split="train", + name=["finemath-3plus"] + ), + "ronantakizawa/github-top-code": dict( + filter_fn=lambda x: x["file_language"] == "Python" # filter for python files only + ), + "HuggingFaceFW/fineweb-2": dict( + name=["fra_Latn", "jpn_Jpan", "kor_Hang", "arb_Arab"], + ) +} +eval_sets = [] +# prepare config to match the expected input of TokenizerCorpus.from_sources +for ds_name, ds_config in eval_configs.items(): + _ds = dict(name=ds_name) + _ds["split"] = ds_config.get("split", "test") + _ds["loader_fn"] = ds_config.get("loader_fn", None) + _ds["generator_source"] = dict(path=ds_name, weight=1.0) + if "filter_fn" in ds_config: + _ds["generator_source"]["filter_fn"] = ds_config["filter_fn"] + if ds_config.get("name", []) == []: + _ds["localdir"] = DATA_DIR / f"corpus/eval_{ds_name.replace('/', '_')}" + _ds["metricname"] = f"{ds_name.split('/')[-1]}" + eval_sets.append(_ds) + else: + for name in ds_config["name"]: + _ds = _ds.copy() + _ds["generator_source"] = _ds["generator_source"].copy() # have to copy to avoid mutating the original for the next iteration + _ds["subset"] = name + _ds["localdir"] = DATA_DIR / f"corpus/eval_{ds_name.replace('/', '_')}:{name}" + _ds["generator_source"]["name"] = name + _ds["metricname"] = f"{ds_name.split('/')[-1]}:{name}" + eval_sets.append(_ds) if __name__ == "__main__": main() \ No newline at end of file From b2a30086d665a0a9e75a04e1b5394903640ae89d Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Thu, 21 May 2026 22:10:31 +0200 Subject: [PATCH 15/18] add minimal fixes --- docs/tokenizer_scaling.md | 46 +++------------------- pyproject.toml | 2 +- scripts/benchmark/tokenizer_corpus_size.py | 28 +++++++++++-- src/gpt_lab/model/auto.py | 12 +++++- src/gpt_lab/tokenizer/auto.py | 18 +++++++-- 
src/gpt_lab/tokenizer/hf.py | 2 +- src/gpt_lab/tokenizer/tokenizer.py | 2 +- src/gpt_lab/utils/schemas.py | 2 +- 8 files changed, 60 insertions(+), 52 deletions(-) diff --git a/docs/tokenizer_scaling.md b/docs/tokenizer_scaling.md index f7a2649..704cefd 100644 --- a/docs/tokenizer_scaling.md +++ b/docs/tokenizer_scaling.md @@ -12,7 +12,7 @@ uv run python -m scripts.benchmark.tokenizer_corpus_size \ --vocab-sizes 20000,50000,100000 \ --pat-strs gpt2,cl100k_base \ --write-corpus \ - --corpus-sizes-mb 10,50,100,500,1000,5000,10000 + --corpus-sizes-gb 10,50,100,500,1000,5000,10000 ``` Args: @@ -21,7 +21,7 @@ Args: - `--vocab-sizes`: Comma-separated list of vocabulary sizes to train tokenizers with. - `--pat-strs`: Comma-separated list of pattern string names to use for tokenizer training. If not specified, defaults to using the GPT-2 pattern string. - `--write-corpus`: Flag to indicate training mode (write corpus). If not set, the script will attempt to load an existing corpus from disk. -- `--corpus-sizes-mb`: Comma-separated list of corpus sizes in megabytes to use for tokenizer training. If not specified, defaults to a range of sizes based on the vocabulary size. +- `--corpus-sizes-gb`: Comma-separated list of corpus sizes in gigabytes to use for tokenizer training. If not specified, defaults to a range of sizes based on the vocabulary size. - `--compare-truncated-baselines`: Whether to compare trained tokenizers with truncated versions of baseline tokenizers. - `--corpus-temperature-alpha`: Optional temperature parameter to control the randomness of the corpus generation. @@ -57,8 +57,7 @@ The evaluation is done on the following datasets (eval_configs in the code): > [!WARNING] > With it current implementation, the script may use the same samples for both training and evaluation, which can lead to overfitting and an overestimation of the tokenizer's performance. 
-> However, the results obtained were quite poored compared to the baselines, given that I could not reach -the optimal memory budget for training the tokenizers. +> However, the results obtained were quite poor compared to the baselines, given that I could not reach the optimal memory budget for training the tokenizers. > Hence, in case of future runs with **exceptionally good results**, it would be important to check whether the training and evaluation samples are overlapping, and if so, to implement a proper train/eval split to get a more accurate estimate of the tokenizer's performance. @@ -81,53 +80,20 @@ This is mainly motivated by the following facts: - Language model have been scaled up but tokenizers sizes have not been scaled up as much, and it is not clear how much the tokenizer performance can be improved by scaling up the tokenizer training corpus and vocabulary size. - According to [3], Language model performance is sensitive to tokenizer size, and the optimal size is often larger than the commonly used 50-100k tokens, especially for larger models. -## Usage - -How to run it from root directory of the repo: - -options: - -h, --help show the help message and exit - --seed SEED - Random seed for reproducibility. Default is 42. - --num-procs NUM_PROCS - Number of processes to use for tokenizer training. Defaults to the number of CPU cores available, capped at 32 to avoid overloading the system. - --vocab-sizes VOCAB_SIZES - Comma-separated list of vocabulary sizes to train tokenizers with. - --pat-strs PAT_STRS - Comma-separated list of pattern string names to use for tokenizer training. If not specified, defaults to using the GPT-2 pattern string. - --write-corpus - Flag to indicate training mode (write corpus). If not set, the script will attempt to load an existing corpus from disk. - --corpus-sizes-mb CORPUS_SIZES_MB - Comma-separated list of corpus sizes in megabytes to use for tokenizer training. 
If not specified, defaults to a range of sizes based on the vocabulary size. - --compare-truncated-baselines - Whether to compare trained tokenizers with truncated versions of baseline tokenizers. - --corpus-temperature-alpha CORPUS_TEMPERATURE_ALPHA - Optional temperature parameter to control the randomness of the corpus generation. Higher values will result in a more diverse corpus, while lower values will make it - more focused on the most common samples. This can be useful for testing how the tokenizer performs with different levels of corpus diversity. - --resume - Whether to resume from existing results file. If set, the script will attempt to load existing results from the specified results path and continue from there, - skipping any experiments that have already been completed. This can be useful for long-running experiments that may be interrupted or for iteratively adding new - configurations without re-running everything. - --results-path RESULTS_PATH - Path to store the results of the tokenizer evaluations. Default to './.gpt_lab/tokenizers/scaling_tokenizer_results.pkl'. If a file already exists at this path, it - will be renamed with a number suffix to avoid overwriting previous results. - -- Make a new scaling run with new corpus sizes: - -## Aknowledgements: +## Acknowledgements This experiment is inspired by and has some code adapted from the following sources: - The Hugging Face Tokenizers library (https://github.com/huggingface/tokenizers) - The OpenAI tiktoken library (https://github.com/openai/tiktoken) - nanochat tokenizer code (https://github.com/karpathy/nanochat) for the idea of using HF-training backend + tiktoken-inference backend for efficient training and evaluation of tokenizers. -## References: +## References 1. Reddy, Varshini, et al. "How much is enough? the diminishing returns of tokenization training data." arXiv preprint arXiv:2502.20273 (2025). 2. Zouhar, Vilém, et al. "Tokenization and the noiseless channel." 
Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2023. 3. Tao, Chaofan, et al. "Scaling laws with vocabulary: Larger models deserve larger vocabularies." Advances in Neural Information Processing Systems 37 (2024): 114147-114179. 4. Karpathy, Andrej. "Let’s Build the GPT Tokenizer: A Complete Guide to Tokenization in LLMs. A text and code version of Karpathy’s famous tokenizer video." https://www.fast.ai/posts/2025-10-16-karpathy-tokenizers.html (2025). -## Contributing: +## Contributing - If you want to contribute to this project, please feel free to open an issue or a pull request. Any contributions are welcome, whether it's fixing a bug, adding a new feature, or improving the documentation. Author: Arthur Testard (arthur.testard.pro@gmail.com) \ diff --git a/pyproject.toml b/pyproject.toml index 801183b..0e077d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ authors = [ ] dependencies = [ + "datasets==4.8.4", "gradio==3.18.0", "jinja2==3.1.6", "kernels==0.11.7", @@ -43,7 +44,6 @@ gpu = [ [dependency-groups] dev = [ "pytest==8.0", - "datasets==4.8.4", "huggingface-hub==1.12.0", "matplotlib==3.10.8", ] diff --git a/scripts/benchmark/tokenizer_corpus_size.py b/scripts/benchmark/tokenizer_corpus_size.py index 9ef81d4..ccf116a 100644 --- a/scripts/benchmark/tokenizer_corpus_size.py +++ b/scripts/benchmark/tokenizer_corpus_size.py @@ -277,13 +277,18 @@ def main(): t_total_start = time.time() with ProcessPoolExecutor(max_workers=meta.get("num_procs")) as executor: - futures = [executor.submit(run_tokenizer_experiment, t) for t in tasks] + future_to_name = {} + futures = [] + for t in tasks: + future = executor.submit(run_tokenizer_experiment, t) + futures.append(future) + future_to_name[future] = t[0] buffer = [] for i, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Tokenizer experiments")): result = future.result() - _ = None # placeholder for run name or other 
metadata if needed - buffer.append((result, _)) + run_name = future_to_name[future] + buffer.append((result, run_name)) if len(buffer) >= args.save_every: store_buffered_results(buffer, result_dir) @@ -308,6 +313,19 @@ def load_all_result_paths(result_dir: Path) -> List[str]: continue return result_paths +def _normalize_result_run_name(run_name: str) -> str: + if not isinstance(run_name, str): + raise ValueError("run_name must be a non-empty string filename") + normalized_run_name = run_name.strip() + if not normalized_run_name: + raise ValueError("run_name must be a non-empty string filename") + run_path = Path(normalized_run_name) + if run_path.name != normalized_run_name or normalized_run_name in {".", ".."}: + raise ValueError("run_name must be a filename without directory components") + if run_path.suffix != ".pkl": + normalized_run_name = f"{normalized_run_name}.pkl" + return normalized_run_name + def store_single_result(result, path): with open(path, "wb") as f: pickle.dump(result, f) @@ -315,6 +333,8 @@ def store_single_result(result, path): def store_buffered_results(buffer, result_dir): for result, run_name in buffer: path = result_dir / run_name + safe_run_name = _normalize_result_run_name(run_name) + path = result_dir / safe_run_name store_single_result(result, path) buffer.clear() @@ -366,7 +386,7 @@ def get_eval_corpus(eval_set): def eval_tokenizer(tokenizer): results = {} - _counter.update(tokens) + _counter = Counter() _token_len_cache = { tok: len(tokenizer.decode([tok])) for tok in range(tokenizer.n_vocab) diff --git a/src/gpt_lab/model/auto.py b/src/gpt_lab/model/auto.py index ed435ed..d4988e0 100644 --- a/src/gpt_lab/model/auto.py +++ b/src/gpt_lab/model/auto.py @@ -164,7 +164,17 @@ def compute_optimal_vocab_size(depth: int) -> int: f"{pat_str} on corpus from {str(DATA_DIR / 'corpus' / self.name)}. 
This may take a while...", logger=logger, level="warning") - tokenizer = build_or_load_tokenizer(self.tokenizer_model, int(vocab_size), True, _tname, PAT_STR.get(pat_str, "gpt2"), special_tokens, DATA_DIR / "corpus" / self.name, self.random_seed, dirname=self.dirname) + tokenizer = build_or_load_tokenizer( + tname=self.tokenizer_model, + vocab_size=int(vocab_size), + train_tokenizer=True, + base_name=_tname, + pattern=PAT_STR.get(pat_str, "gpt2"), + special_tokens=special_tokens, + corpus_dir=DATA_DIR / "corpus" / self.name, + random_seed=self.random_seed, + dirname=self.dirname + ) param_counts = model.n_params_per_layer() diff --git a/src/gpt_lab/tokenizer/auto.py b/src/gpt_lab/tokenizer/auto.py index 0bad08e..570cc93 100644 --- a/src/gpt_lab/tokenizer/auto.py +++ b/src/gpt_lab/tokenizer/auto.py @@ -64,7 +64,7 @@ def build_meta_model_from_depth(d: int, vocab_size: int = -1): vocab_size = round(opt_vocab_size / step) * step if vocab_size < 256: - raise ValueError("Computed optimal vocab size is <256; increase model size or set vocab_size explicitly.") + raise ValueError(f"Computed optimal vocab size {vocab_size} is <256; increase model size or set vocab_size explicitly.") return int(vocab_size) + len(special_tokens.list()) @@ -80,15 +80,27 @@ def resolve_tokenizer(name: Optional[str], vocab_size: int, special_tokens: Spec return get_closest_tokenizer_size(vocab_size)[0] -def build_or_load_tokenizer(tname: Optional[str], vocab_size: int, train_tokenizer: bool, base_name: str, pat_str: str, special_tokens: SpecialTokens, data_dir, random_seed: int, dirname=None): +def build_or_load_tokenizer( + name: Optional[str], + vocab_size: int, + train_tokenizer: bool, + base_name: str, + pat_str: str, + special_tokens: SpecialTokens, + data_dir, + random_seed: int, + dirname=None + ) -> Tokenizer: """Orchestrate loading or training of a tokenizer. - If `not train_tokenizer`, attempt to load a pretrained tokenizer. 
- Else, train a new tokenizer using the corpus and `TokenizerTrainerConfig`. Returns a `Tokenizer` instance. """ + if name in ("auto", None): + name = resolve_tokenizer(name, vocab_size, special_tokens) if not train_tokenizer: - name_or_choice = tname or resolve_tokenizer(tname, vocab_size, special_tokens) + name_or_choice = name or resolve_tokenizer(name, vocab_size, special_tokens) try: return Tokenizer.from_pretrained(name_or_choice) except Exception as e: diff --git a/src/gpt_lab/tokenizer/hf.py b/src/gpt_lab/tokenizer/hf.py index 25c5737..ece0053 100644 --- a/src/gpt_lab/tokenizer/hf.py +++ b/src/gpt_lab/tokenizer/hf.py @@ -115,7 +115,7 @@ def train_huggingface_from_iterator(text_iterator: Iterable[str], config: Tokeni # Prefer training-specific params container when available _special_tokens = list(config.special_tokens) or [] - vocab_size_no_special = config.vocab_size - len(config.special_tokens) + vocab_size_no_special = config.vocab_size - len(_special_tokens) trainer = BpeTrainer( vocab_size=vocab_size_no_special, show_progress=True, diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index afb30b0..2deb335 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -398,7 +398,7 @@ def update_token_bytes(self): log0(f"Updated token bytes after truncation from {old_vocab_size:,} to {self.vocab_size:,}", logger=logger) # Save token_bytes to disk token_bytes_path = Path(self.config.dirname) / "token_bytes.pt" - torch.save(self._token_bytes, token_bytes_path) + torch.save(self.token_bytes, token_bytes_path) # Persist tokenizer config/metadata try: self.config.save_to_directory() diff --git a/src/gpt_lab/utils/schemas.py b/src/gpt_lab/utils/schemas.py index ab5524f..d4a4372 100644 --- a/src/gpt_lab/utils/schemas.py +++ b/src/gpt_lab/utils/schemas.py @@ -171,7 +171,7 @@ def save_to_directory(self, directory: Optional[Union[str, Path]] = None): if not directory.name == cleaned_name: # add model 
name to path if not already included directory = directory / cleaned_name config_path = directory / "config.pkl" - config_path.mkdir(parents=True, exist_ok=True) + config_path.parent.mkdir(parents=True, exist_ok=True) with open(str(config_path), "wb") as f: pickle.dump(self, f) From 5385b95d71dcfe1439961e2f36a88788e55592b6 Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Thu, 21 May 2026 22:13:12 +0200 Subject: [PATCH 16/18] fix test with wrong tokenizer.auto.build_or_load_tokenizer signature --- tests/test_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 8fc9219..b1292ae 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -264,7 +264,7 @@ def test_build_or_load_tokenizer_notrain_uses_pretrained(monkeypatch): ) out = tokenizer_auto.build_or_load_tokenizer( - tname="gpt2", + name="gpt2", vocab_size=32000, train_tokenizer=False, base_name="unused", @@ -296,7 +296,7 @@ def iterator(self): ) out = tokenizer_auto.build_or_load_tokenizer( - tname=None, + name=None, vocab_size=4096, train_tokenizer=True, base_name="my_tok", From 600676981a9bc8cf6cf252b60f08c727afdf247e Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Thu, 21 May 2026 22:21:04 +0200 Subject: [PATCH 17/18] tokenizer: readme --- README.md | 86 ++++++++++++++++++++++++++----------------------------- uv.lock | 4 +-- 2 files changed, 42 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 8c25a1c..9745ab6 100644 --- a/README.md +++ b/README.md @@ -238,23 +238,22 @@ meta_config = cfg.generate_gpt_config(device="cuda") Next sections detail the different generated components. -### Tokenization - - -The tokenization implementation are located in [`gpt_lab.tokenizer`](./src/gpt_lab/tokenizer/tokenizer.py). The code only includes BPE tokenization for now (include sentencepiece is a TODO). The tokenizer training is only supported by huggingface implementation for now. 
For inference, the tiktoken implementation is the default one, as it is much faster than the huggingface one. The custom BPE implementation is still under development, and is not functional yet. - -#### Training a tokenizer - -```python -from gpt_lab.tokenizer import Tokenizer -from gpt_lab.tokenizer.corpus import TokenizerCorpus -from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig - -# uses default corpus settings (mixture of HuggingFaceFW/fineweb-edu, HuggingFaceFW/fineweb-2, HuggingFaceTB/finemath and codeparrot/codeparrot-clean) -corpus = TokenizerCorpus.from_sources(random_seed=42) -trainer_cfg = TokenizerTrainerConfig( - source="huggingface", # training backend (e.g., "huggingface", "tiktoken", "bpe", "fbpe", "rbpe", "dummy") - to_save=True, # pattern for pre-tokenization (e.g., "gpt2", "cl100k-base", etc., or regex pattern for custom pre-tokenization) -) -cfg = TokenizerConfig( - name="my_tokenizer", - vocab_size=32_000, - pat_str="gpt2", - trainer=trainer_cfg,# whether to save the trained tokenizer to disk -) -tokenizer = Tokenizer.train_from_iterator(cfg, iterator=corpus.iterator()) -``` +> Pretrained and truncated tokenizers are fully deterministic. #### Using a pre-trained tokenizer @@ -383,9 +359,27 @@ The tokenizer training script is located in `scripts/train_tokenizer.py`. It all Training time benchmarks for different implementations and configurations. All the tokenizers were trained on corpus generated from `gpt_lab.tokenizer.corpus.TokenizerCorpus()` with default settings, tuned with variable `vocab_size`. - +#### Scaling laws for tokenizer training + +In [docs/tokenizer_scaling.md](./docs/tokenizer_scaling.md), we analyze how ByteLevel BPE tokenization scales with different corpus sizes, vocabulary sizes, and split patterns. The goal is to understand the trade-offs between these factors and their impact on tokenization quality and efficiency. 
+ +To experiment yourself tokenizer scaling, you can run the following command from the root directory of the repo: + +```bash +uv run python -m scripts.benchmark.tokenizer_corpus_size \ + --seed 42 \ + --num-procs 16 \ + --vocab-sizes 20000,50000,100000 \ + --pat-strs gpt2,cl100k_base \ + --write-corpus \ + --corpus-sizes-gb 10,50,100,500,1000,5000,10000 +``` + +More details on the arguments are given in [tokenizer_corpus_size.py](./scripts/benchmark/tokenizer_corpus_size.py) or using `--help`: + +```bash +uv run python -m scripts.benchmark.tokenizer_corpus_size --help +``` ### Model architecture diff --git a/uv.lock b/uv.lock index 47faa39..0e87f21 100644 --- a/uv.lock +++ b/uv.lock @@ -573,6 +573,7 @@ name = "gpt-lab" version = "0.1.0" source = { editable = "." } dependencies = [ + { name = "datasets" }, { name = "gradio" }, { name = "jinja2" }, { name = "kernels" }, @@ -606,7 +607,6 @@ gpu = [ [package.dev-dependencies] dev = [ - { name = "datasets" }, { name = "huggingface-hub" }, { name = "matplotlib" }, { name = "pytest" }, @@ -618,6 +618,7 @@ notebook = [ [package.metadata] requires-dist = [ + { name = "datasets", specifier = "==4.8.4" }, { name = "gradio", specifier = "==3.18.0" }, { name = "jinja2", specifier = "==3.1.6" }, { name = "kernels", specifier = "==0.11.7" }, @@ -642,7 +643,6 @@ provides-extras = ["cpu", "gpu"] [package.metadata.requires-dev] dev = [ - { name = "datasets", specifier = "==4.8.4" }, { name = "huggingface-hub", specifier = "==1.12.0" }, { name = "matplotlib", specifier = "==3.10.8" }, { name = "pytest", specifier = "==8.0" }, From 083f2f24544447d50f7c99d22bf17e051ec31300 Mon Sep 17 00:00:00 2001 From: Arthur Testard Date: Thu, 21 May 2026 22:42:01 +0200 Subject: [PATCH 18/18] tokenizer: readme + minor fixes --- README.md | 5 ++--- docs/tokenizer_scaling.md | 12 ++++++++++++ src/gpt_lab/model/auto.py | 6 +++--- src/gpt_lab/tokenizer/corpus.py | 13 +++++++++---- src/gpt_lab/tokenizer/hf.py | 2 +- src/gpt_lab/tokenizer/tokenizer.py | 
4 ++-- src/gpt_lab/utils/logging.py | 4 ++-- 7 files changed, 31 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 9745ab6..f4f03f0 100644 --- a/README.md +++ b/README.md @@ -281,8 +281,7 @@ tokenizer is loaded, truncated, saved, and returned. tokenizer = Tokenizer.from_pretrained("cl100k_base_truncated_32000") ``` -Truncation always preserves all 256 byte-level tokens and reassigns ranks -to be contiguous from 0. +Truncation always preserves all 256 byte-level tokens and reassigns ranks to be contiguous from 0. #### Training a tokenizer @@ -303,7 +302,7 @@ cfg = TokenizerConfig( pat_str="gpt2", trainer=trainer_cfg,# whether to save the trained tokenizer to disk ) -tokenizer = Tokenizer.train_from_iterator(cfg, iterator=corpus.iterator()) +tokenizer = Tokenizer.train_from_iterator(iterator=corpus.iterator(), config=trainer_cfg) ``` diff --git a/docs/tokenizer_scaling.md b/docs/tokenizer_scaling.md index 704cefd..169e7f9 100644 --- a/docs/tokenizer_scaling.md +++ b/docs/tokenizer_scaling.md @@ -80,6 +80,18 @@ This is mainly motivated by the following facts: - Language model have been scaled up but tokenizers sizes have not been scaled up as much, and it is not clear how much the tokenizer performance can be improved by scaling up the tokenizer training corpus and vocabulary size. - According to [3], Language model performance is sensitive to tokenizer size, and the optimal size is often larger than the commonly used 50-100k tokens, especially for larger models. +## About the Evaluation + +The evaluation is done on different datasets; which can be easily customized in the code (making it easy to add new evaluation datasets is a TODO), with the following metrics: +- **Compression ratio**: the ratio of the number of tokens produced by the tokenizer to the number of characters in the input text. +- **Efficiency**: the average number of characters per token, which is the inverse of the compression ratio. 
+- **Rényi entropy**: introduced in [2], it is a generalization of the Shannon entropy that can be used to measure the diversity of the token distribution. It is defined as: + $$H_\alpha(X) = \frac{1}{1-\alpha} \log \sum_{i=1}^n p_i^\alpha$$ + where $p_i$ is the probability of the $i$-th token in the distribution, and $\alpha$ is a parameter that controls the sensitivity of the entropy to the probabilities of the tokens. When $\alpha \to 1$, the Rényi entropy converges to the Shannon entropy, which is the most commonly used measure of entropy. When $\alpha > 1$, the Rényi entropy is more sensitive to the probabilities of the most common tokens, while when $\alpha < 1$, it is more sensitive to the probabilities of the less common tokens. In our experiments, we use $\alpha = 2.5$, which is a common choice in the literature for measuring the diversity of token distributions. +- **Efficient Entropy**: also introduced in [2], it is the Rényi entropy with $\alpha = 2.5$ normalized by the logarithm of the vocabulary size: + $$H_\alpha^{\text{eff}}(X) = \frac{H_\alpha(X)}{\log n}$$ + where $n$ is the number of tokens in the vocabulary. The efficient entropy is a measure of the diversity of the token distribution that takes into account the size of the vocabulary. It is defined as the Rényi entropy scaled by the logarithm of the number of tokens in the vocabulary, which allows us to compare tokenizers with different vocabulary sizes on a more equal footing.
+ ## Acknowledgements This experiment is inspired by and has some code adapted from the following sources: diff --git a/src/gpt_lab/model/auto.py b/src/gpt_lab/model/auto.py index d4988e0..c2367ac 100644 --- a/src/gpt_lab/model/auto.py +++ b/src/gpt_lab/model/auto.py @@ -165,13 +165,13 @@ def compute_optimal_vocab_size(depth: int) -> int: logger=logger, level="warning") tokenizer = build_or_load_tokenizer( - tname=self.tokenizer_model, + name=self.tokenizer_model, vocab_size=int(vocab_size), train_tokenizer=True, base_name=_tname, - pattern=PAT_STR.get(pat_str, "gpt2"), + pat_str=PAT_STR.get(pat_str, "gpt2"), special_tokens=special_tokens, - corpus_dir=DATA_DIR / "corpus" / self.name, + data_dir=DATA_DIR / "corpus" / self.name, random_seed=self.random_seed, dirname=self.dirname ) diff --git a/src/gpt_lab/tokenizer/corpus.py b/src/gpt_lab/tokenizer/corpus.py index 87c8155..02348fa 100644 --- a/src/gpt_lab/tokenizer/corpus.py +++ b/src/gpt_lab/tokenizer/corpus.py @@ -9,10 +9,7 @@ # TODO: consider using compression.ztsd when python.version >= 3.14 (pi) from tqdm import tqdm -try: - from datasets import load_dataset -except ImportError: - load_dataset = None +from datasets import load_dataset import logging @@ -51,6 +48,14 @@ def apply_temperature_sampling( scaled = [w ** alpha for w in raw] total = sum(scaled) + if total == 0: + if not sources: + return sources + uniform_weight = 1.0 / len(sources) + for src in sources: + src["weight"] = uniform_weight + return sources + for src, w in zip(sources, scaled): src["weight"] = w / total diff --git a/src/gpt_lab/tokenizer/hf.py b/src/gpt_lab/tokenizer/hf.py index ece0053..bf374c7 100644 --- a/src/gpt_lab/tokenizer/hf.py +++ b/src/gpt_lab/tokenizer/hf.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Iterable, Dict -from gpt_lab.utils.logging import log_all, log0, log_error +from gpt_lab.utils.logging import log0, log_error from gpt_lab.utils.schemas import TokenizerConfig, TokenizerTrainerConfig from 
gpt_lab.utils.special_tokens import SpecialTokens from gpt_lab.tokenizer.base import _BaseTokenizer diff --git a/src/gpt_lab/tokenizer/tokenizer.py b/src/gpt_lab/tokenizer/tokenizer.py index 2deb335..25192f2 100644 --- a/src/gpt_lab/tokenizer/tokenizer.py +++ b/src/gpt_lab/tokenizer/tokenizer.py @@ -424,8 +424,8 @@ def save_to_directory(self, directory: Optional[Union[str, Path]] = None): # Save token bytes tensor token_bytes_path = directory / "token_bytes.pt" - torch.save(self._token_bytes, token_bytes_path) - + torch.save(self.token_bytes, token_bytes_path) + # Write a lightweight JSON descriptor alongside the pickle config for readability config_json = { "name": self.config.name, diff --git a/src/gpt_lab/utils/logging.py b/src/gpt_lab/utils/logging.py index faacad2..d6f5f7a 100644 --- a/src/gpt_lab/utils/logging.py +++ b/src/gpt_lab/utils/logging.py @@ -111,11 +111,11 @@ def log_critical(message, error_type=RuntimeError, logger=logger): logger.critical(_with_rank(message), stacklevel=3) raise error_type(message) -def log_all(msg, level=logging.ERROR, logger=logger): +def log_all(msg, level=logging.ERROR, logger=logger, raise_error: bool = False): if isinstance(level, str): level = log_levels.get(level.upper(), logging.ERROR) logger.log(level, _with_rank(msg), stacklevel=3) - if level >= logging.ERROR: + if level >= logging.ERROR and raise_error: raise RuntimeError(msg) def log_dict(title, info, logger=logger, level=logging.INFO, only_rank0=True, structured=False):