tiktoken-cli/tiktoken_cli.py at master · adev0x/tiktoken-cli · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
"""Count OpenAI tokens from the command line."""

import argparse
import sys

try:
    import tiktoken
except ImportError:
    # Diagnostics go to stderr so they never mix with token output that a
    # caller may be piping or capturing from stdout.
    print(
        "Error: tiktoken is required. Install with: pip install tiktoken",
        file=sys.stderr,
    )
    sys.exit(1)

# Model used when the caller does not name one (function default and --model).
DEFAULT_MODEL = "gpt-4"

# Encoding names that compare_encodings() tokenizes the text with, in order.
ENCODINGS = ["cl100k_base", "p50k_base", "p50k_edit", "r50k_base"]


def count_tokens(text, model=DEFAULT_MODEL):
    """Tokenize *text* with the encoding associated with *model*.

    Args:
        text: The string to tokenize.
        model: Model name passed to ``tiktoken.encoding_for_model``. When
            that lookup raises ``KeyError`` the ``cl100k_base`` encoding is
            used instead.

    Returns:
        A ``(tokens, encoding_name)`` tuple: the result of encoding *text*
        and the ``name`` attribute of the encoding that produced it.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        enc = tiktoken.get_encoding("cl100k_base")
    # By default tiktoken raises ValueError when the input contains the text
    # of a special token (e.g. "<|endoftext|>"). A counter must accept
    # arbitrary input, so encode such sequences as ordinary text instead.
    tokens = enc.encode(text, disallowed_special=())
    return tokens, enc.name


def format_count(text, model, show_ids=False):
    """Build a multi-line, human-readable token report for *text*.

    Args:
        text: The string to tokenize.
        model: Model name forwarded to ``count_tokens``.
        show_ids: When true, append a line with the first 50 token IDs
            (followed by ``...`` if there are more).

    Returns:
        The report as a single newline-joined string listing the model,
        encoding, token count, character count and chars-per-token ratio.
    """
    token_ids, enc_name = count_tokens(text, model)
    token_total = len(token_ids)
    char_total = len(text)

    # Guard the division: empty input yields zero tokens.
    if token_total > 0:
        chars_per_token = char_total / token_total
    else:
        chars_per_token = 0

    report = [
        f"Model:    {model}",
        f"Encoding: {enc_name}",
        f"Tokens:   {token_total:,}",
        f"Chars:    {char_total:,}",
        f"Ratio:    {chars_per_token:.1f} chars/token",
    ]

    if show_ids:
        preview = token_ids[:50]
        suffix = "..." if token_total > 50 else ""
        report.append(f"IDs:      {preview}{suffix}")

    return "\n".join(report)


def compare_encodings(text):
    """Print the token count of *text* under every encoding in ``ENCODINGS``.

    A preview of the text (first 80 characters) and its character count are
    printed first, followed by one row per encoding. An encoding that cannot
    be loaded or applied is reported on stderr and skipped, so one failure
    does not abort the comparison and does not pass unnoticed either.

    Args:
        text: The string to tokenize.
    """
    print(f"Text: {text[:80]}{'...' if len(text) > 80 else ''}")
    print(f"Chars: {len(text):,}")
    print()
    for name in ENCODINGS:
        try:
            enc = tiktoken.get_encoding(name)
            # Treat special-token text (e.g. "<|endoftext|>") as ordinary
            # text; the tiktoken default would raise ValueError for it.
            tokens = enc.encode(text, disallowed_special=())
        except Exception as exc:
            # Best-effort per encoding, but surface the reason rather than
            # silently dropping the row.
            print(f"  {name:<16s} unavailable: {exc}", file=sys.stderr)
        else:
            print(f"  {name:<16s} {len(tokens):>8,} tokens")


def main():
    """Parse command-line arguments and print the requested token report.

    The text to tokenize is taken from, in order of precedence: the file
    given with ``-f/--file``, the positional ``text`` argument, or standard
    input when it is not a terminal. If none of these is available the help
    text is printed and the process exits with status 1.

    With ``--compare`` the text is tokenized under every known encoding;
    otherwise a single report for ``--model`` is printed.
    """
    parser = argparse.ArgumentParser(description="Count OpenAI tokens")
    parser.add_argument("text", nargs="?", help="text to tokenize")
    parser.add_argument("-f", "--file", help="read text from file")
    parser.add_argument(
        "-m",
        "--model",
        default=DEFAULT_MODEL,
        # Derive the hint from the constant so the two cannot drift apart.
        help=f"model name (default: {DEFAULT_MODEL})",
    )
    parser.add_argument("--ids", action="store_true", help="show token IDs")
    parser.add_argument("--compare", action="store_true", help="compare across encodings")

    args = parser.parse_args()

    if args.file:
        try:
            # Decode explicitly as UTF-8 so results do not depend on the
            # platform's default locale encoding.
            with open(args.file, encoding="utf-8") as f:
                text = f.read()
        except (OSError, UnicodeDecodeError) as exc:
            # parser.error() prints usage plus the message and exits (status 2),
            # giving a clean CLI error instead of a traceback.
            parser.error(f"cannot read {args.file}: {exc}")
    elif args.text is not None:
        # "is not None" so an explicitly passed empty string is still counted
        # instead of being mistaken for a missing argument.
        text = args.text
    elif not sys.stdin.isatty():
        text = sys.stdin.read()
    else:
        parser.print_help()
        sys.exit(1)

    if args.compare:
        compare_encodings(text)
    else:
        print(format_count(text, args.model, args.ids))


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()