feat: [sc-26105] Add first/last name tokenizer to NameAI #606
Conversation
The reviewed diff adds three quality tests for the person-name tokenizer:

```python
import json
import math

from nameai.data import get_resource_path

# init_person_name_tokenizer is a context manager defined earlier in this test file
# (not shown in this diff); it yields the PersonNamesTokenizer under test.


def test_person_name_tokenizer_simple_names():
    """Verify tokenization of clear person names."""
    with init_person_name_tokenizer([]) as tokenizer:
        with open(get_resource_path('tests/person_names_quality.json')) as f:
            quality_tests = json.load(f)

        failures = []
        for input_label, expected_tokens in quality_tests['simple_names'].items():
            tokenized_labels = list(tokenizer.tokenize_with_scores(input_label))
            expected_tuple = tuple(expected_tokens)
            found = False
            for tokens, score in tokenized_labels:
                if tokens == expected_tuple:
                    found = True
                    assert score > -float('inf'), f'Expected valid score for {input_label}'
                    break
            if not found:
                failures.append(f'Failed to find expected tokenization for {input_label}')

        if failures:
            print('\n=== PersonNameTokenizer Quality Test Failures [simple_names] ===')
            for failure in failures:
                print(failure)
            print(f'\nTotal failures: {len(failures)} out of {len(quality_tests["simple_names"])} test cases')
            assert False, 'Some tokenization quality tests failed. See above for details.'


def test_person_name_tokenizer_ambiguous_names():
    """Verify handling of ambiguous inputs that could be names."""
    with init_person_name_tokenizer([]) as tokenizer:
        with open(get_resource_path('tests/person_names_quality.json')) as f:
            quality_tests = json.load(f)

        failures = []
        for input_label, interpretation2expected_tokens in quality_tests['ambiguous_names'].items():
            tokenized_labels = list(tokenizer.tokenize_with_scores(input_label))
            if interpretation2expected_tokens['person_name'] is not None:
                person_name_tokens = tuple(interpretation2expected_tokens['person_name'])
                found = False
                for tokens, score in tokenized_labels:
                    if tokens == person_name_tokens:
                        found = True
                        assert score > -float('inf'), f'Expected valid score for {input_label}'
                        break
                if not found:
                    failures.append(f'Failed to find person name tokenization for {input_label}')

        if failures:
            print('\n=== PersonNameTokenizer Quality Test Failures [ambiguous_names] ===')
            for failure in failures:
                print(failure)
            print(f'\nTotal failures: {len(failures)} out of {len(quality_tests["ambiguous_names"])} test cases')
            assert False, 'Some tokenization quality tests failed. See above for details.'


def test_person_name_tokenizer_non_names_low_scores():
    """Verify that non-name inputs get low (< 1e-10) probability scores."""
    with init_person_name_tokenizer([]) as tokenizer:
        with open(get_resource_path('tests/person_names_quality.json')) as f:
            quality_tests = json.load(f)

        failures = []
        for input_label in quality_tests['non_names'].keys():
            tokenized_labels = list(tokenizer.tokenize_with_scores(input_label))
            for tokens, log_prob in tokenized_labels:
                # every returned tokenization of a non-name should score below log(1e-10)
                if log_prob >= math.log(1e-10):
                    failures.append(f'Expected very low score for non-name {input_label}, got {log_prob}')

        if failures:
            print('\n=== PersonNameTokenizer Quality Test Failures [non_names] ===')
            for failure in failures:
                print(failure)
            print(f'\nTotal failures: {len(failures)} out of {len(quality_tests["non_names"])} test cases')
            assert False, 'Some tokenization quality tests failed. See above for details.'
```
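For context, the tests above imply a fixture shaped roughly like the sketch below. Only the top-level section names (`simple_names`, `ambiguous_names`, `non_names`) and the `person_name` sub-key are taken from the test code; the example labels and token lists are hypothetical.

```python
import json

# Hypothetical sketch of tests/person_names_quality.json, inferred only from how the
# tests above read it; the labels and token values are made-up illustrations.
EXAMPLE_QUALITY_FIXTURE = {
    # simple_names: label -> the expected token sequence
    'simple_names': {
        'johnsmith': ['john', 'smith'],
    },
    # ambiguous_names: label -> interpretation -> expected tokens
    # ('person_name' may be None when that reading should not be produced)
    'ambiguous_names': {
        'thomasedison': {
            'person_name': ['thomas', 'edison'],
        },
        'bookshop': {
            'person_name': None,
        },
    },
    # non_names: only the keys are used; every tokenization should score below log(1e-10)
    'non_names': {
        'xqzvbnk': None,
    },
}

if __name__ == '__main__':
    print(json.dumps(EXAMPLE_QUALITY_FIXTURE, indent=2))
```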
Are these tests simply adding a probability score check compared to those from test_nlp_inspector.py?
In test_tokenizer.py the tokenizers are tested separately (AllTokenizer and PersonNamesTokenizer).
In test_nlp_inspector.py the tokenizations come from both tokenizers (the merging is done in NLPInspector).
So these tests cover different levels of the tokenization pipeline.
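As a rough illustration of the lower, tokenizer-only level being discussed here: a minimal sketch based solely on the two calls visible in the diff (`tokenize_with_scores` yielding `(tokens, log_prob)` pairs). The helper name and output format are illustrative, and the NLPInspector-level merging is not shown.

```python
import math
from typing import Iterable, Tuple

# Consumes the (tokens, log_prob) pairs that tokenize_with_scores() yields, as the
# tests in this diff do. The tuple shape is an assumption taken from that test code.
def print_tokenizations(label: str,
                        scored: Iterable[Tuple[Tuple[str, ...], float]]) -> None:
    for tokens, log_prob in scored:
        prob = math.exp(log_prob) if log_prob > -float('inf') else 0.0
        print(f'{label!r} -> {tokens} (log_prob={log_prob:.4f}, prob={prob:.3g})')

# Usage, inside the same context manager the tests use:
#     with init_person_name_tokenizer([]) as tokenizer:
#         print_tokenizations('johnsmith', tokenizer.tokenize_with_scores('johnsmith'))
```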
It all looks good to me. One thing that bothers me is maintaining two separate implementations of the same functionality. Could we consider replacing the corresponding functionality in NameGraph with the implementation from here? @djstrong
Story details: https://app.shortcut.com/ps-web3/story/26105
todo:
- add s3 env vars to .env.example
- make bucket public
- python -m nameai.download in ci/cd, deployment scripts