Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions sec_parser/utils/bs4_/text_styles_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@
if TYPE_CHECKING: # pragma: no cover
from bs4 import Tag

# HTML semantic tags that imply CSS styles, even without an explicit style attribute.
# This allows the classifier to detect bold/italic/underline applied via <b>, <strong>,
# <i>, <em>, and <u> tags in addition to inline CSS (e.g. font-weight: bold).
_SEMANTIC_TAG_STYLES: dict[str, dict[str, str]] = {
"b": {"font-weight": "bold"},
"strong": {"font-weight": "bold"},
"i": {"font-style": "italic"},
"em": {"font-style": "italic"},
"u": {"text-decoration": "underline"},
}


def compute_text_styles_metrics(tag: Tag) -> dict[tuple[str, str], float]:
"""
Expand Down Expand Up @@ -57,10 +68,19 @@ def _compute_effective_style(tag: Tag) -> dict[str, str]:
"""
Aggregate the effective styles for a given tag by
traversing up the parent hierarchy.

In addition to inline ``style`` attributes, this function also recognises
semantic HTML tags (``<b>``, ``<strong>``, ``<i>``, ``<em>``, ``<u>``)
whose names imply specific CSS properties. This ensures that text
emphasised via bare ``<b>`` or ``<strong>`` tags (rather than via
``style="font-weight:bold"``) is correctly detected as highlighted.
"""
effective_styles: dict[str, str] = {}
found_tag: Tag | None = tag
while found_tag:
# Inline style attributes take precedence over semantic tag names, so
# process them first (setdefault ensures the innermost/highest-specificity
# rule wins as we traverse toward the root).
if "style" in found_tag.attrs:
found_styles = found_tag["style"]
if isinstance(found_styles, list): # pragma: no cover
Expand All @@ -76,5 +96,14 @@ def _compute_effective_style(tag: Tag) -> dict[str, str]:
val = val.strip()
# Only set if not previously set to respect CSS cascading rules
effective_styles.setdefault(prop, val)

# After inline styles, check for implied styles from semantic HTML tag
# names (e.g. <b>, <strong>, <i>, <em>, <u>). setdefault ensures an
# explicit inline style on the same element still wins.
tag_name = getattr(found_tag, "name", None)
if tag_name and tag_name in _SEMANTIC_TAG_STYLES:
for prop, val in _SEMANTIC_TAG_STYLES[tag_name].items():
effective_styles.setdefault(prop, val)

found_tag = found_tag.find_parent()
return effective_styles
22 changes: 22 additions & 0 deletions tests/unit/processing_steps/test_highlighted_text_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,28 @@
{"type": NotYetClassifiedElement, "tag": "p"},
],
),
(
"bold via <b> tag",
"""
<p><b>Bold text using b tag</b></p>
<p>Regular text without bold</p>
""",
[
{"type": TitleElement, "tag": "p"},
{"type": TextElement, "tag": "p"},
],
),
(
"bold via <strong> tag",
"""
<p><strong>Bold text using strong tag</strong></p>
<p>Regular text without bold</p>
""",
[
{"type": TitleElement, "tag": "p"},
{"type": TextElement, "tag": "p"},
],
),
],
ids=[v[0] for v in values],
)
Expand Down
72 changes: 72 additions & 0 deletions tests/unit/utils/bs4_/test_text_styles_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,75 @@ def test_should_return_correct_metrics_for_inherited_styles():
# Assert
assert result[("color", "#000000")] == 100.0
assert result[("font-weight", "600")] == 50.0


def test_should_detect_bold_from_b_tag():
    """A paragraph whose text sits inside ``<b>`` reports font-weight bold at 100.0."""
    # Arrange: parse the markup and grab the enclosing paragraph in one step.
    markup = "<p><b>Bold text</b></p>"
    paragraph = BeautifulSoup(markup, "lxml").find("p")

    # Act
    metrics = compute_text_styles_metrics(paragraph)

    # Assert: the bare tag name alone is enough to yield the bold entry.
    bold_key = ("font-weight", "bold")
    assert metrics[bold_key] == 100.0


def test_should_detect_bold_from_strong_tag():
    """A paragraph whose text sits inside ``<strong>`` reports font-weight bold at 100.0."""
    # Arrange: parse the markup and grab the enclosing paragraph in one step.
    markup = "<p><strong>Bold text</strong></p>"
    paragraph = BeautifulSoup(markup, "lxml").find("p")

    # Act
    metrics = compute_text_styles_metrics(paragraph)

    # Assert: <strong> carries the same implied weight as <b>.
    bold_key = ("font-weight", "bold")
    assert metrics[bold_key] == 100.0


def test_should_detect_italic_from_i_tag():
    """A paragraph whose text sits inside ``<i>`` reports font-style italic at 100.0."""
    # Arrange: parse the markup and grab the enclosing paragraph in one step.
    markup = "<p><i>Italic text</i></p>"
    paragraph = BeautifulSoup(markup, "lxml").find("p")

    # Act
    metrics = compute_text_styles_metrics(paragraph)

    # Assert: the bare tag name alone is enough to yield the italic entry.
    italic_key = ("font-style", "italic")
    assert metrics[italic_key] == 100.0


def test_should_detect_italic_from_em_tag():
    """A paragraph whose text sits inside ``<em>`` reports font-style italic at 100.0."""
    # Arrange: parse the markup and grab the enclosing paragraph in one step.
    markup = "<p><em>Italic text</em></p>"
    paragraph = BeautifulSoup(markup, "lxml").find("p")

    # Act
    metrics = compute_text_styles_metrics(paragraph)

    # Assert: <em> carries the same implied style as <i>.
    italic_key = ("font-style", "italic")
    assert metrics[italic_key] == 100.0


def test_inline_style_should_override_semantic_tag():
    """An inline ``font-weight`` declared on a ``<b>`` element beats the implied bold."""
    # Arrange: the <b> tag name implies bold, yet its own style attribute
    # declares font-weight 400 (normal weight).
    markup = '<p><b style="font-weight:400">Not actually bold</b></p>'
    paragraph = BeautifulSoup(markup, "lxml").find("p")

    # Act
    metrics = compute_text_styles_metrics(paragraph)

    # Assert: only the inline value (400) is reported for the text, and no
    # entry exists for the "bold" value implied by the tag name.
    inline_key = ("font-weight", "400")
    implied_key = ("font-weight", "bold")
    assert metrics[inline_key] == 100.0
    assert implied_key not in metrics