Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 27 additions & 46 deletions eval_protocol/human_id/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,85 +12,66 @@
def generate_id(
separator: str = "-",
seed: int | float | str | bytes | bytearray | None = None,
word_count: int = 5,
index: int | None = None,
) -> str:
"""
Generate a human readable ID
Generate a human readable ID in format: adjective-noun-NN

:param separator: The string to use to separate words
:param seed: The seed to use. The same seed will produce the same ID or index-based mapping
:param index: Optional non-negative integer providing a 1:1 mapping to an ID.
When provided, the mapping is deterministic and bijective for
all integers in range [0, total_combinations).
:param word_count: The number of words to use. Minimum of 3.
:return: A human readable ID
"""
if word_count < 3:
raise ValueError("word_count cannot be lower than 3")

# If a specific index is provided, use mixed-radix encoding into a fixed
# sequence of parts to guarantee a bijection between integers and IDs.
# The sequence cycles as: verb, adjective, noun, verb, adjective, noun, ...
# If a specific index is provided, use it for deterministic generation
if index is not None:
if not isinstance(index, int) or index < 0:
raise ValueError("index must be a non-negative integer if provided")

# Prepare category lists; if seed is provided, shuffle deterministically
base_categories = [dictionary.verbs, dictionary.adjectives, dictionary.nouns]
if seed is not None:
rnd = random.Random(seed)
categories = [tuple(rnd.sample(cat, len(cat))) for cat in base_categories]
adjectives = tuple(rnd.sample(dictionary.adjectives, len(dictionary.adjectives)))
nouns = tuple(rnd.sample(dictionary.nouns, len(dictionary.nouns)))
else:
categories = base_categories
# Build the category order for the desired word_count
ordered_categories = [categories[i % 3] for i in range(word_count)]
adjectives = dictionary.adjectives
nouns = dictionary.nouns

# Compute total number of combinations for this word_count
radices = [len(cat) for cat in ordered_categories]
total = num_combinations(word_count)
# Calculate total combinations: adjectives * nouns * 100 (for 00-99)
total = len(adjectives) * len(nouns) * 100

if index >= total:
raise ValueError(f"index out of range for given word_count. Received {index}, max allowed is {total - 1}")
raise ValueError(f"index out of range. Received {index}, max allowed is {total - 1}")

# Mixed-radix decomposition (least significant position is the last word)
digits: list[int] = []
remaining = index
for base in reversed(radices):
digits.append(remaining % base)
remaining //= base
digits.reverse()
# Decompose index into adjective, noun, and number
number = index % 100
remaining = index // 100
noun_idx = remaining % len(nouns)
adj_idx = remaining // len(nouns)

words = [ordered_categories[pos][digits[pos]] for pos in range(word_count)]
return separator.join(words)
adjective = adjectives[adj_idx]
noun = nouns[noun_idx]

return f"{adjective}{separator}{noun}{separator}{number:02d}"

# Random generation
random_obj = system_random
if seed is not None:
random_obj = random.Random(seed)

parts = {dictionary.verbs: 1, dictionary.adjectives: 1, dictionary.nouns: 1}

for _ in range(3, word_count):
parts[random_obj.choice(list(parts.keys()))] += 1

parts = itertools.chain.from_iterable(random_obj.sample(part, count) for part, count in parts.items())
adjective = random_obj.choice(dictionary.adjectives)
noun = random_obj.choice(dictionary.nouns)
number = random_obj.randint(0, 99)

return separator.join(parts)
return f"{adjective}{separator}{noun}{separator}{number:02d}"


def num_combinations(word_count: int = 5) -> int:
def num_combinations() -> int:
"""
Return the total number of unique IDs possible for the given word_count.
Return the total number of unique IDs possible.

The sequence of categories cycles as: verb, adjective, noun, then repeats.
This value can be used to mod an index when calling generate_id(index=...).
Format uses adjective-noun-NN, so total = adjectives * nouns * 100.
"""
if word_count < 3:
raise ValueError("word_count cannot be lower than 3")

categories = [dictionary.verbs, dictionary.adjectives, dictionary.nouns]
radices = [len(categories[i % 3]) for i in range(word_count)]
total = 1
for r in radices:
total *= r
return total
return len(dictionary.adjectives) * len(dictionary.nouns) * 100
115 changes: 70 additions & 45 deletions tests/test_human_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,66 +4,91 @@
from eval_protocol.human_id import generate_id, num_combinations


def test_generate_id_index_basic_3_words():
# index 0 maps to the first element of each category (verb, adjective, noun)
assert generate_id(index=0, word_count=3) == "be-other-time"

# incrementing index advances the least-significant position (noun)
assert generate_id(index=1, word_count=3) == "be-other-year"

# carry into the adjective when nouns wrap
# index == len(nouns) => adjective advances by 1, noun resets
# nouns length inferred by probing with large indices is brittle; instead, compute via reach
# We know index=0 gives be-other-time, and index that produces adjective=new, noun=time should be reachable.
# Derive by scanning forward until adjective changes to 'new'. This keeps test robust to dictionary size edits.
base = generate_id(index=0, word_count=3)
# Find the first index where adjective becomes 'new' and noun resets to 'time'
target = None
for i in range(1, 2000):
cand = generate_id(index=i, word_count=3)
if cand.startswith("be-new-time"):
target = i
break
assert target is not None, "Expected to find carry into adjective within search bound"
assert generate_id(index=target, word_count=3) == "be-new-time"


def test_generate_id_index_word_count_cycle():
# word_count cycles categories: verb, adj, noun, verb, adj, ...
assert generate_id(index=0, word_count=5) == "be-other-time-be-other"
# increment least-significant position (adj at position 5)
assert generate_id(index=1, word_count=5) == "be-other-time-be-new"


def test_generate_id_index_out_of_range_and_negative():
# Use exported total combinations for clean boundary checks
total = num_combinations(word_count=3)
def test_generate_id_basic_format():
"""Test that generate_id produces the expected adjective-noun-NN format"""
id_str = generate_id(index=0)
# Should match pattern: adjective-noun-NN where NN is 00-99
assert re.match(r"^[a-z]+-[a-z]+-\d{2}$", id_str)

# Test a few specific indices to ensure deterministic behavior
assert generate_id(index=0) == "other-time-00"
assert generate_id(index=1) == "other-time-01"
assert generate_id(index=99) == "other-time-99"
assert generate_id(index=100) == "other-year-00"


def test_generate_id_index_mapping():
"""Test that index mapping works correctly"""
# Test number cycling (0-99)
for i in range(100):
id_str = generate_id(index=i)
expected_num = f"{i:02d}"
assert id_str.endswith(f"-{expected_num}")
assert id_str.startswith("other-time-")

# Test noun advancement after 100 numbers
id_100 = generate_id(index=100)
assert id_100.startswith("other-year-00")

# Test adjective advancement (after all nouns * 100)
# This will depend on dictionary size, so let's test the pattern
from eval_protocol.human_id import dictionary

nouns_count = len(dictionary.nouns)
adjective_boundary = nouns_count * 100

id_at_boundary = generate_id(index=adjective_boundary)
# Should have advanced to the next adjective
assert not id_at_boundary.startswith("other-")


def test_generate_id_index_out_of_range():
"""Test that invalid indices raise appropriate errors"""
total = num_combinations()
assert total > 0
# Last valid index
generate_id(index=total - 1, word_count=3)
# First invalid index

# Last valid index should work
generate_id(index=total - 1)

# First invalid index should raise error
with pytest.raises(ValueError):
generate_id(index=total, word_count=3)
generate_id(index=total)

# Negative index should raise error
with pytest.raises(ValueError):
generate_id(index=-1, word_count=3)
generate_id(index=-1)


def test_generate_id_seed_stability_and_compat():
# Without index, same seed yields same id
def test_generate_id_seed_stability():
"""Test that same seed produces same ID"""
a = generate_id(seed=1234)
b = generate_id(seed=1234)
assert a == b

# Without index, default produces separator '-' and at least 3 components
c = generate_id()
assert re.match(r"^[a-z]+(-[a-z]+){2,}$", c)

assert re.match(r"^[a-z]+-[a-z]+-\d{2}$", c)


def test_generate_id_index_ignores_seed():
# With index provided, seed should affect the mapping deterministically
def test_generate_id_seed_with_index():
"""Test that seed affects index-based generation deterministically"""
x = generate_id(index=42, seed=1)
y = generate_id(index=42, seed=999)
z = generate_id(index=42, seed=1)
assert x != y

# Same seed should produce same result
assert x == z
# Different seeds should produce different results
assert x != y

# All should follow the correct format
assert re.match(r"^[a-z]+-[a-z]+-\d{2}$", x)
assert re.match(r"^[a-z]+-[a-z]+-\d{2}$", y)


def test_generate_id_random_format():
"""Test that random generation (no index) produces correct format"""
for _ in range(10):
id_str = generate_id()
assert re.match(r"^[a-z]+-[a-z]+-\d{2}$", id_str)
Loading