Skip to content

Commit 732be65

Browse files
author
Halvani
committed
.
1 parent 6948604 commit 732be65

1 file changed

Lines changed: 135 additions & 0 deletions

File tree

tests/test_textunit_core_basic.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# TextUnitLib/tests/test_textunit_core_basic.py
2+
3+
from __future__ import annotations
4+
5+
import pytest
6+
import spacy
7+
8+
from textunitlib.core import TextUnit
9+
10+
11+
# ----------------------------------------------------------------------
12+
# Fixtures
13+
# ----------------------------------------------------------------------
14+
15+
16+
@pytest.fixture
def tu_en() -> TextUnit:
    """Provide a ``TextUnit`` configured for English.

    The instance is backed by ``spacy.blank("en")`` rather than a
    downloaded spaCy model package.
    """
    blank_pipeline = spacy.blank("en")
    return TextUnit(
        language=TextUnit.Language.English,
        nlp=blank_pipeline,
    )
21+
22+
23+
# ----------------------------------------------------------------------
24+
# characters()
25+
# ----------------------------------------------------------------------
26+
27+
28+
def test_characters_returns_all_chars_in_order(tu_en: TextUnit):
    """``characters()`` returns every character of the input, in input order."""
    sample = " Text\nUnit 42 "

    # list(str) gives one entry per character (spaces and the newline
    # included), which is exactly what the call is expected to return.
    assert tu_en.characters(sample) == list(sample)
34+
35+
36+
def test_characters_can_drop_whitespaces(tu_en: TextUnit):
    """With ``drop_whitespaces=True`` only non-whitespace characters remain."""
    sample = " A\tB \nC "
    # Spaces, the tab and the newline are dropped; A, B, C keep their order.
    expected = ["A", "B", "C"]

    assert tu_en.characters(sample, drop_whitespaces=True) == expected
42+
43+
44+
def test_characters_empty_string(tu_en: TextUnit):
    """An empty input string yields an empty list of characters."""
    extracted = tu_en.characters("")
    assert extracted == []
46+
47+
48+
# ----------------------------------------------------------------------
49+
# spaces()
50+
# ----------------------------------------------------------------------
51+
52+
53+
def test_spaces_extracts_only_whitespace(tu_en: TextUnit):
    """``spaces()`` returns just the whitespace characters, in input order."""
    sample = " A\tB \nC"
    # Leading space, tab, space, newline -- the letters are left out.
    expected = [" ", "\t", " ", "\n"]

    assert tu_en.spaces(sample) == expected
59+
60+
61+
def test_spaces_no_whitespace_yields_empty_list(tu_en: TextUnit):
    """Input without any whitespace produces an empty result."""
    found = tu_en.spaces("NoSpacesHere")
    assert found == []
63+
64+
65+
# ----------------------------------------------------------------------
66+
# punctuation_marks()
67+
# ----------------------------------------------------------------------
68+
69+
70+
def test_punctuation_marks_uses_configured_punctuation(tu_en: TextUnit):
    """Check that ``punctuation_marks()`` returns configured marks in text order.

    Two marks are taken from ``tu_en.prop_punctuation`` and embedded between
    letters; the call must return exactly those marks, in the order they
    appear in the text. The test is skipped when ``prop_punctuation`` is
    empty.
    """
    configured = tu_en.prop_punctuation
    # If no punctuation resource is available, skip this test
    if len(configured) == 0:
        pytest.skip("No punctuation resource loaded.")

    # Pick the marks from a sorted copy so every run exercises the same
    # characters. NOTE(review): prop_punctuation looks like a set of strings
    # (confirm); iterating such a set follows hash order, which is randomized
    # per interpreter run, so list(...)[0] could differ between runs and make
    # failures hard to reproduce.
    candidates = sorted(configured)
    p1 = candidates[0]
    # With a single configured mark, use it twice.
    p2 = candidates[1] if len(candidates) > 1 else candidates[0]

    text = f"A{p1}B{p2}C"
    result = tu_en.punctuation_marks(text)

    # We only inserted these two punctuation chars, so we expect them in order
    assert result == [p1, p2]
86+
87+
88+
def test_punctuation_marks_ignores_non_punctuation(tu_en: TextUnit):
    """Text made only of letters and a space yields no punctuation marks."""
    assert tu_en.punctuation_marks("Hello World") == []
92+
93+
94+
# ----------------------------------------------------------------------
95+
# vowels()
96+
# ----------------------------------------------------------------------
97+
98+
99+
def test_vowels_ignores_consonants(tu_en: TextUnit):
    """A consonant-only string produces no vowels."""
    consonants_only = "bcdfg"
    found = tu_en.vowels(consonants_only)
    assert found == []
102+
103+
104+
# ----------------------------------------------------------------------
105+
# letters()
106+
# ----------------------------------------------------------------------
107+
108+
109+
def test_letters_returns_only_alphabetic_chars(tu_en: TextUnit):
    """``letters()`` keeps alphabetic characters only, in input order."""
    sample = "Te2xt-Ü!"
    # The digit, hyphen and exclamation mark are excluded; the non-ASCII
    # letter Ü is expected to be kept.
    expected = ["T", "e", "x", "t", "Ü"]

    assert tu_en.letters(sample) == expected
115+
116+
117+
def test_letters_empty_if_no_alpha(tu_en: TextUnit):
    """Digits, a space and exclamation marks alone yield no letters."""
    assert tu_en.letters("1234 !!!") == []
120+
121+
122+
# ----------------------------------------------------------------------
123+
# digits()
124+
# ----------------------------------------------------------------------
125+
126+
127+
def test_digits_returns_only_digits(tu_en: TextUnit):
    """``digits()`` returns each digit character, in input order."""
    sample = "abc123xyz456"
    # The two digit runs are returned as individual characters.
    expected = ["1", "2", "3", "4", "5", "6"]

    assert tu_en.digits(sample) == expected
131+
132+
133+
def test_digits_empty_if_no_digits(tu_en: TextUnit):
    """Text containing no digit characters yields an empty list."""
    assert tu_en.digits("No digits here!") == []

0 commit comments

Comments
 (0)