Skip to content

Commit 31d9cc2

Browse files
committed
fix(webvtt): deal with HTML entities in cue text spans
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
1 parent bc041b5 commit 31d9cc2

File tree

2 files changed

+20 -1 lines changed

docling_core/types/doc/webvtt.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,23 @@ class _WebVTTCueTextSpan(BaseModel):
     text: str
     span_type: Literal["text"] = "text"
 
+    _valid_entities: ClassVar[set] = {"amp", "lt", "gt", "lrm", "rlm", "nbsp"}
+    _entity_pattern: ClassVar[re.Pattern] = re.compile(r"&([a-zA-Z0-9]+);")
+
     @field_validator("text", mode="after")
     @classmethod
     def validate_text(cls, value: str) -> str:
-        if any(ch in value for ch in {"\n", "\r", "&", "<"}):
+        for match in cls._entity_pattern.finditer(value):
+            entity = match.group(1)
+            if entity not in cls._valid_entities:
+                raise ValueError(
+                    f"Cue text span contains an invalid HTML entity: &{entity};"
+                )
+        if "&" in re.sub(cls._entity_pattern, "", value):
+            raise ValueError(
+                "Found '&' not part of a valid entity in the cue text span"
+            )
+        if any(ch in value for ch in {"\n", "\r", "<"}):
             raise ValueError("Cue text span contains invalid characters")
         if len(value) == 0:
             raise ValueError("Cue text span cannot be empty")

test/test_webvtt.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,12 @@ def test_vtt_cue_commponents():
     invalid_text = "This cue text span contains &."
     with pytest.raises(ValidationError):
         _WebVTTCueTextSpan(text=invalid_text)
+    invalid_text = "An invalid &foo; entity"
+    with pytest.raises(ValidationError):
+        _WebVTTCueTextSpan(text=invalid_text)
+    valid_text = "My favorite book is Pride &amp; Prejudice"
+    span = _WebVTTCueTextSpan(text=valid_text)
+    assert span.text == valid_text
 
     """Test with text containing less-than sign."""
     invalid_text = "This cue text span contains <."

0 commit comments

Comments
 (0)