Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 61 additions & 14 deletions scripts/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,62 @@


# Stylesheet embedded in the tab's HTML via <style>{css}</style>.
# Token spans cycle through four tint classes (tokenizer-token-0..3); the tint
# is stronger under a dark colour scheme so it stays visible on a dark
# background. Each selector is declared once per scheme: an earlier, fainter
# dark-scheme rule set was fully overridden by the one kept here.
css = """
@media (prefers-color-scheme: dark) {
    .tokenizer-token{
        cursor: pointer;
    }
    .tokenizer-token-0 {background: rgba(255, 0, 0, 0.2);}
    .tokenizer-token-0:hover {background: rgba(255, 0, 0, 0.4);}
    .tokenizer-token-1 {background: rgba(0, 255, 0, 0.2);}
    .tokenizer-token-1:hover {background: rgba(0, 255, 0, 0.4);}
    .tokenizer-token-2 {background: rgba(0, 0, 255, 0.2);}
    .tokenizer-token-2:hover {background: rgba(0, 0, 255, 0.4);}
    .tokenizer-token-3 {background: rgba(255, 156, 0, 0.2);}
    .tokenizer-token-3:hover {background: rgba(255, 156, 0, 0.4);}
}
@media (prefers-color-scheme: light) {
    .tokenizer-token{
        cursor: pointer;
    }
    .tokenizer-token-0 {background: rgba(255, 0, 0, 0.1);}
    .tokenizer-token-0:hover {background: rgba(255, 0, 0, 0.2);}
    .tokenizer-token-1 {background: rgba(0, 255, 0, 0.1);}
    .tokenizer-token-1:hover {background: rgba(0, 255, 0, 0.2);}
    .tokenizer-token-2 {background: rgba(0, 0, 255, 0.1);}
    .tokenizer-token-2:hover {background: rgba(0, 0, 255, 0.2);}
    .tokenizer-token-3 {background: rgba(255, 156, 0, 0.1);}
    .tokenizer-token-3:hover {background: rgba(255, 156, 0, 0.2);}
}
"""


def tokenize(text, input_is_ids=False):
def tokenize(text, current_step=1, total_step=1, AND_block=0, simple_input=False, input_is_ids=False):
clip: FrozenCLIPEmbedder = shared.sd_model.cond_stage_model.wrapped

token_count = None
if input_is_ids:
tokens = [int(x.strip()) for x in text.split(",")]
else:
elif simple_input:
tokens = clip.tokenizer(text, truncation=False, add_special_tokens=False)["input_ids"]
else:
from modules import sd_hijack, prompt_parser
from functools import reduce
_, prompt_flat_list, _ = prompt_parser.get_multicond_prompt_list([text])
prompt_schedules = prompt_parser.get_learned_conditioning_prompt_schedules(prompt_flat_list, int(total_step))
flat_prompts = reduce(lambda list1, list2: list1+list2, prompt_schedules)
prompts = [prompt_text for step, prompt_text in flat_prompts]

def find_current_prompt_idx(c_step, a_block):
    """Return the index, in the flattened prompt list, of the prompt active at a step.

    ``prompt_schedules`` (from the enclosing scope) holds one schedule per
    AND-separated block; each schedule is a sequence of entries whose first
    element is compared against the step as an inclusive upper bound. The
    flattened list is the schedules concatenated in order, so the result is
    the number of entries in all earlier blocks plus the position of the
    first entry in block ``a_block`` whose bound is >= ``c_step``.

    Args:
        c_step: Sampling step to look up.
        a_block: Zero-based index of the AND block to inspect.

    Returns:
        Index into the flattened list of scheduled prompts.

    Raises:
        ValueError: If ``a_block`` does not name an existing block, or
            ``c_step`` lies beyond the last boundary of that block. The
            previous behaviour was to fall through and return ``None``,
            which only surfaced later as an unrelated-looking ``TypeError``
            when the result was used as a list index.
    """
    offset = 0
    for block_no, schedule in enumerate(prompt_schedules):
        if block_no != a_block:
            # Entries of other blocks still occupy slots in the flat list.
            offset += len(schedule)
            continue
        for position, entry in enumerate(schedule):
            if c_step <= entry[0]:
                return offset + position
        raise ValueError(
            f"Step {c_step} is beyond the last scheduled step of AND block {a_block}."
        )
    raise ValueError(
        f"AND block {a_block} does not exist; the prompt has {len(prompt_schedules)} block(s)."
    )

idx = find_current_prompt_idx(current_step, AND_block)
tokens, token_count, max_length = [sd_hijack.model_hijack.tokenize(prompt) for prompt in prompts][idx]

vocab = {v: k for k, v in clip.tokenizer.get_vocab().items()}

Expand All @@ -44,7 +79,10 @@ def dump(last=False):

def wordscode(ids, word):
nonlocal class_index
res = f"""<span class='tokenizer-token tokenizer-token-{class_index%4}' title='{html.escape(", ".join([str(x) for x in ids]))}'>{html.escape(word)}</span>"""
if ids != [clip.tokenizer.eos_token_id]:
res = f"""<span class='tokenizer-token tokenizer-token-{class_index%4}' title='{html.escape(", ".join([str(x) for x in ids]))}'>{html.escape(word)}</span>"""
else:
res = f"""<span class='tokenizer-token tokenizer-token-4' title='{html.escape(", ".join([str(x) for x in ids]))}'>{html.escape(word)}</span>"""
class_index += 1
return res

Expand Down Expand Up @@ -83,9 +121,11 @@ def wordscode(ids, word):

dump(last=True)

if token_count is None:
token_count = len(ids)
ids_html = f"""
<p>
Token count: {len(ids)}<br>
Token count: {token_count}/{len(ids)}<br>
{", ".join([str(x) for x in ids])}
</p>
"""
Expand All @@ -98,14 +138,21 @@ def add_tab():
gr.HTML(f"""
<style>{css}</style>
<p>
Before your text is sent to the neural network, it gets turned into numbers in a process called tokenization. These tokens are how the neural network reads and interprets text. Thanks to our great friends at Shousetsu愛 for inspiration for this feature.
Before your text is sent to the neural network, it gets turned into numbers in a process called tokenization. These tokens are how the neural network reads and interprets text. Thanks to our great friends at Shousetsu愛 for inspiration for this feature.<br>
Depending on your setting, text will be first parsed by webui to calculate prompt attention like (text) and [text], and scheduler like [a:b:0.5], and the capital AND like a AND b before tokenization. This extension processes your text like this as well.<br>
To disable this feature, check on "Don't parse webui special grammar".
</p>
""")

with gr.Tabs() as tabs:
with gr.Tab("Text input", id="input_text"):
prompt = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Prompt for tokenization")
go = gr.Button(value="Tokenize", variant="primary")
is_simple = gr.Checkbox(label="Don't parse webui special grammar", interactive=True)
with gr.Row():
current_step = gr.Number(label='Current sampling steps', value=1, step=1, interactive=True)
total_step = gr.Number(label='Total sampling steps', value=28, step=1, interactive=True)
and_block = gr.Number(label='Which block of prompts (separated by AND) to tokenize', value=0, step=1, interactive=True)

with gr.Tab("ID input", id="input_ids"):
prompt_ids = gr.Textbox(label="Prompt", elem_id="tokenizer_prompt", show_label=False, lines=8, placeholder="Ids for tokenization (example: 9061, 631, 736)")
Expand All @@ -120,7 +167,7 @@ def add_tab():

go.click(
fn=tokenize,
inputs=[prompt],
inputs=[prompt, current_step, total_step, and_block, is_simple],
outputs=[tokenized_text, tokens],
)

Expand Down