-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtiktoken_cli.py
More file actions
88 lines (68 loc) · 2.31 KB
/
tiktoken_cli.py
File metadata and controls
88 lines (68 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
"""Count OpenAI tokens from the command line."""
import argparse
import sys
try:
    import tiktoken
except ImportError:
    # Diagnostics go to stderr so that piped or captured stdout never
    # contains an error message in place of a token report.
    print(
        "Error: tiktoken is required. Install with: pip install tiktoken",
        file=sys.stderr,
    )
    sys.exit(1)
# Model name used when the caller / command line does not specify one.
DEFAULT_MODEL = "gpt-4"
# Encoding names that compare_encodings() iterates over, in output order.
ENCODINGS = ["cl100k_base", "p50k_base", "p50k_edit", "r50k_base"]
def count_tokens(text, model=DEFAULT_MODEL):
    """Tokenize *text* with the encoding tiktoken associates with *model*.

    Args:
        text: The string to tokenize.
        model: Model name passed to ``tiktoken.encoding_for_model``. If
            tiktoken raises ``KeyError`` for it, the ``cl100k_base``
            encoding is used instead.

    Returns:
        A ``(tokens, encoding_name)`` tuple: the token IDs produced for
        *text* and the name of the encoding that produced them.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unrecognised model name: fall back to a fixed encoding rather
        # than failing.
        enc = tiktoken.get_encoding("cl100k_base")
    # With its defaults, encode() raises ValueError when the text contains a
    # special-token string such as "<|endoftext|>". This tool counts tokens
    # of arbitrary user text, so such strings are encoded as ordinary text
    # instead of aborting the count.
    tokens = enc.encode(text, disallowed_special=())
    return tokens, enc.name
def format_count(text, model, show_ids=False):
    """Build the multi-line token report for *text*.

    Args:
        text: The string to tokenize and measure.
        model: Model name forwarded to ``count_tokens``.
        show_ids: When true, append a line listing the first 50 token IDs,
            followed by ``...`` if there are more than 50.

    Returns:
        A newline-joined string with the model, encoding name, token count,
        character count and characters-per-token ratio (0 when there are no
        tokens), plus the optional IDs line.
    """
    token_ids, encoding_name = count_tokens(text, model)
    token_total = len(token_ids)
    char_total = len(text)

    # Guard the division: empty input yields zero tokens.
    if token_total > 0:
        chars_per_token = char_total / token_total
    else:
        chars_per_token = 0

    report = "\n".join(
        (
            f"Model: {model}",
            f"Encoding: {encoding_name}",
            f"Tokens: {token_total:,}",
            f"Chars: {char_total:,}",
            f"Ratio: {chars_per_token:.1f} chars/token",
        )
    )
    if not show_ids:
        return report

    suffix = "..." if token_total > 50 else ""
    return f"{report}\nIDs: {token_ids[:50]}{suffix}"
def compare_encodings(text):
    """Print the token count of *text* under each encoding in ``ENCODINGS``.

    Writes a preview of the text (its first 80 characters, followed by
    ``...`` when longer), the character count, a blank line, and then one
    line per encoding to stdout. An encoding that cannot be loaded or applied
    is reported on stderr and skipped, so the remaining encodings are still
    compared.

    Args:
        text: The string to tokenize with every listed encoding.
    """
    print(f"Text: {text[:80]}{'...' if len(text) > 80 else ''}")
    print(f"Chars: {len(text):,}")
    print()
    for name in ENCODINGS:
        try:
            enc = tiktoken.get_encoding(name)
            # Encode special-token strings (e.g. "<|endoftext|>") as ordinary
            # text; by default encode() raises ValueError on them.
            tokens = enc.encode(text, disallowed_special=())
        except Exception as exc:
            # Best-effort comparison: keep going, but tell the user why this
            # encoding has no row instead of dropping it silently.
            print(f" {name:<16s} unavailable: {exc}", file=sys.stderr)
            continue
        print(f" {name:<16s} {len(tokens):>8,} tokens")
def main():
    """Parse command-line arguments and print a token report.

    The text to tokenize is taken from, in order of precedence: the file
    given with ``-f/--file``, the positional ``text`` argument, or standard
    input when it is not a terminal. If none of these supplies input, the
    usage help is printed and the process exits with status 1.

    With ``--compare`` the text is counted under every known encoding;
    otherwise a single report for ``--model`` is printed (including token IDs
    when ``--ids`` is given).
    """
    parser = argparse.ArgumentParser(description="Count OpenAI tokens")
    parser.add_argument("text", nargs="?", help="text to tokenize")
    parser.add_argument("-f", "--file", help="read text from file")
    parser.add_argument("-m", "--model", default=DEFAULT_MODEL, help="model name (default: gpt-4)")
    parser.add_argument("--ids", action="store_true", help="show token IDs")
    parser.add_argument("--compare", action="store_true", help="compare across encodings")
    args = parser.parse_args()

    if args.file:
        try:
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default locale encoding.
            with open(args.file, encoding="utf-8") as f:
                text = f.read()
        except (OSError, UnicodeDecodeError) as exc:
            # parser.error() prints usage plus the message to stderr and
            # exits with status 2, replacing a raw traceback.
            parser.error(f"cannot read {args.file}: {exc}")
    elif args.text is not None:
        # "is not None" so an explicitly passed empty string is still counted
        # rather than being mistaken for a missing argument.
        text = args.text
    elif not sys.stdin.isatty():
        text = sys.stdin.read()
    else:
        parser.print_help()
        sys.exit(1)

    if args.compare:
        compare_encodings(text)
    else:
        print(format_count(text, args.model, args.ids))


if __name__ == "__main__":
    main()