diff --git a/src/diffenator2/html.py b/src/diffenator2/html.py
index e25530d..cfc2c67 100644
--- a/src/diffenator2/html.py
+++ b/src/diffenator2/html.py
@@ -85,8 +85,13 @@ def diffenator_font_style(dfont, suffix=""):
 
 
 def filtered_font_sample_text(ttFont, characters):
-    sample_text = font_sample_text(ttFont)
-    sample_text = [w for w in sample_text if characters_in_string(w, characters)]
+    """Return space-joined sample words built only from ``characters``
+    that are also mapped in the font's cmap."""
+    # getBestCmap() returns None when the font has no Unicode cmap subtable.
+    font_characters = {chr(c) for c in (ttFont.getBestCmap() or {})}
+    characters = set(characters) & font_characters
+    # font_sample_text is lru_cached, so pass a hashable, order-stable key.
+    sample_text = font_sample_text(tuple(sorted(characters)))
     return " ".join(sample_text)
 
 
diff --git a/src/diffenator2/utils.py b/src/diffenator2/utils.py
index 8acfcbb..f464c17 100644
--- a/src/diffenator2/utils.py
+++ b/src/diffenator2/utils.py
@@ -132,39 +132,73 @@ def gen_gif(img_a_path: str, img_b_path: str, dst: str):
     img_a.save(dst, save_all=True, append_images=[img_b], loop=10000, duration=1000)
 
 
+def greedy_set_cover(universe, sets):
+    """Greedily pick words from ``sets`` until ``universe`` is covered.
+
+    Each round takes the word containing the most still-uncovered
+    characters; the loop ends once everything is covered or no remaining
+    word adds coverage. The picks are returned sorted by the code point
+    of their last character.
+    """
+    uncovered_elements = set(universe)
+    # Sort once so ties resolve the same way whatever the hash seed is.
+    candidates = sorted(sets)
+    best_sets = []
+    while uncovered_elements and candidates:
+        best_set = max(
+            candidates, key=lambda s: len(uncovered_elements.intersection(s))
+        )
+        if uncovered_elements.isdisjoint(best_set):
+            # No word extends the cover any further: stop without
+            # appending a pick that contributes nothing.
+            break
+        uncovered_elements.difference_update(best_set)
+        best_sets.append(best_set)
+    # sorted() is stable, so equal keys keep their pick order.
+    return sorted(best_sets, key=lambda k: ord(k[-1]))
+
+
 @lru_cache()
-def font_sample_text(ttFont: TTFont) -> str:
-    """Collect words which exist in the Universal Declaration of Human Rights
-    that can be formed using the ttFont instance.
-    UDHR has been chosen due to the many languages it covers"""
+def font_sample_text(font_characters):
+    """Collect words from the Universal Declaration of Human Rights and
+    the GF languages sample texts that can be formed using only
+    font_characters.
+
+    UDHR has been chosen due to the many languages it covers.
+
+    font_characters must be hashable (e.g. a tuple of characters) because
+    results are memoized with lru_cache. Words with a repeated character
+    or adding no new character are dropped, then greedy_set_cover picks
+    a small subset that still covers as many characters as possible."""
     with open(
         resource_filename("diffenator2", "data/udhr_all.txt"), encoding="utf8"
     ) as doc:
-        uhdr = doc.read()
-
-    cmap = set(ttFont.getBestCmap())
-    words = []
-    seen_chars = set()
-
-    def _add_words(words, text, seen_chars):
-        for word in text.split():
-            chars = set(ord(l) for l in word)
-            if not chars.issubset(cmap):
-                continue
-            if chars & seen_chars == chars:
-                continue
-            seen_chars |= chars
-            words.append(word)
-
-    _add_words(words, uhdr, seen_chars)
-
-    if len(seen_chars) < len(cmap):
-        languages = LoadLanguages()
-        for file, proto in languages.items():
-            if hasattr(proto, "sample_text"):
-                for _, text in proto.sample_text.ListFields():
-                    _add_words(words, text, seen_chars)
-    return words
+        words = re.split(r"\W+", doc.read())
+
+    # GF languages sample text
+    languages = LoadLanguages()
+    for _, proto in languages.items():
+        if hasattr(proto, "sample_text"):
+            for _, text in proto.sample_text.ListFields():
+                words += re.split(r"\W+", text)
+
+    # Keep only words that are spellable with the font's characters, have
+    # no repeated character and contribute a character not yet seen.
+    # Longest words go first so each kept word covers as much as possible.
+    available = set(font_characters)
+    seen = set()
+    new_words = set()
+    for word in sorted(words, key=len, reverse=True):
+        word_set = set(word)
+        if len(word_set) != len(word):
+            continue
+        if word_set.issubset(seen):
+            continue
+        if word_set.issubset(available):
+            seen |= word_set
+            new_words.add(word)
+
+    return greedy_set_cover(available, new_words)
 
 
 def font_family_name(ttFont, suffix=""):
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 06737ec..7d663b5 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -95,9 +95,9 @@ def test_diffenator_threshold(fp_before, fp_after, threshold, has, missing):
 @pytest.mark.parametrize(
     "fp, cmd, pattern, has, missing",
     [
-        (mavenpro_vf, "proof", ".*", ['>an tan'], []),
+        (mavenpro_vf, "proof", ".*", ['>tan'], []),
         (mavenpro_vf, "proof", "[an]{1,2}", ['>an'], []),
-        (mavenpro_vf, "diff", ".*", ['>an tan'], []),
+        (mavenpro_vf, "diff", ".*", ['>tan'], []),
         (mavenpro_vf, "diff", "[an]{1,2}", ['>an'], []),
     ]
 )