Skip to content

Commit 4ebff29

Browse files
committed
feat: Add support for processing and converting TEI formula elements to LossyJSON and Markdown, including new tests.
1 parent 96663de commit 4ebff29

3 files changed

Lines changed: 193 additions & 25 deletions

File tree

grobid_client/format/TEI2LossyJSON.py

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,7 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
677677
"""
678678
Process a div and its nested content, handling various back section types.
679679
Supports nested divs for complex back sections like annex with multiple subsections.
680+
Also handles formula elements that are direct children of divs.
680681
"""
681682
head = div.find("head")
682683
p_nodes = div.find_all("p")
@@ -691,10 +692,12 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
691692
if child.name == "div" or child.name.endswith(":div"):
692693
nested_divs.append(child)
693694

694-
# Count only direct child paragraphs, not those in nested divs
695+
# Count only direct child paragraphs and formulas, not those in nested divs
695696
direct_p_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "p"]
697+
direct_formula_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "formula"]
698+
has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0
696699

697-
if len(nested_divs) > 0 and len(direct_p_nodes) == 0:
700+
if len(nested_divs) > 0 and not has_direct_content:
698701
# This is a container div - process each nested div independently
699702
for nested_div in nested_divs:
700703
# Skip references divs
@@ -707,11 +710,11 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
707710

708711
# Determine the section header and content type for divs with content
709712
if head:
710-
if len(direct_p_nodes) == 0:
711-
# This div has only a head, no paragraphs (standalone head)
713+
if not has_direct_content:
714+
# This div has only a head, no paragraphs or formulas (standalone head)
712715
current_head_paragraph = self._clean_text(head.get_text())
713716
else:
714-
# This div has both head and paragraphs - head is the section header
717+
# This div has both head and content - head is the section header
715718
head_section = self._clean_text(head.get_text())
716719
else:
717720
# If no head element, try to use the type attribute as head_section
@@ -726,35 +729,68 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
726729
head_section = "Author Contributions"
727730
elif div_type == "availability":
728731
# Only set as default if this div has its own content
729-
if len(direct_p_nodes) > 0:
732+
if has_direct_content:
730733
head_section = "Data Availability"
731734
elif div_type == "annex":
732735
head_section = "Annex"
733736
else:
734737
# Generic handling - capitalize and format
735738
head_section = div_type.replace("_", " ").title()
736739

737-
# Process paragraphs in this div
738-
if len(direct_p_nodes) > 0:
739-
for id_p, p in enumerate(direct_p_nodes):
740+
# Process direct children (paragraphs and formulas) in document order
741+
for child in div.children:
742+
if not hasattr(child, 'name') or not child.name:
743+
continue
744+
745+
if child.name == "p":
740746
paragraph_id = get_random_id(prefix="p_")
741747

742748
if passage_level == "sentence":
743-
for id_s, sentence in enumerate(p.find_all("s")):
749+
for id_s, sentence in enumerate(child.find_all("s")):
744750
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
745751
if self.validate_refs:
746752
for ref in struct['refs']:
747753
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
748754
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
749755
yield struct
750756
else:
751-
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
757+
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child)
752758
if self.validate_refs:
753759
for ref in struct['refs']:
754760
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
755761
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
756762
yield struct
757763

764+
elif child.name == "formula":
765+
# Process formula elements as passages
766+
formula_id = get_random_id(prefix="f_")
767+
formula_text = self._clean_text(child.get_text())
768+
769+
if formula_text:
770+
# Create a passage structure for the formula
771+
formula_passage = {
772+
"id": formula_id,
773+
"text": formula_text,
774+
"coords": [
775+
box_to_dict(coord.split(","))
776+
for coord in child.get("coords", "").split(";")
777+
] if child.has_attr("coords") else [],
778+
"refs": [],
779+
"type": "formula"
780+
}
781+
782+
if current_head_paragraph or head_paragraph:
783+
formula_passage["head_paragraph"] = current_head_paragraph or head_paragraph
784+
if head_section:
785+
formula_passage["head_section"] = head_section
786+
787+
# Extract formula label if present
788+
label = child.find("label")
789+
if label:
790+
formula_passage["label"] = self._clean_text(label.get_text())
791+
792+
yield formula_passage
793+
758794
# Update head_paragraph for potential next div
759795
if current_head_paragraph is not None:
760796
head_paragraph = current_head_paragraph

grobid_client/format/TEI2Markdown.py

Lines changed: 59 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -213,14 +213,24 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str:
213213
head = div.find("head")
214214
if head:
215215
section_title = head.get_text().strip()
216-
fulltext_sections.append(f"### {section_title}\n")
217-
218-
# Get paragraphs
219-
paragraphs = div.find_all("p")
220-
for p in paragraphs:
221-
paragraph_text = self._process_paragraph(p)
222-
if paragraph_text.strip():
223-
fulltext_sections.append(f"{paragraph_text}\n\n")
216+
if section_title:
217+
fulltext_sections.append(f"### {section_title}\n")
218+
219+
# Process direct children of the div in document order
220+
# This captures paragraphs, formulas, and other elements as they appear
221+
for child in div.children:
222+
if not hasattr(child, 'name') or not child.name:
223+
continue
224+
225+
if child.name == "p":
226+
paragraph_text = self._process_paragraph(child)
227+
if paragraph_text.strip():
228+
fulltext_sections.append(f"{paragraph_text}\n\n")
229+
elif child.name == "formula":
230+
# Handle formula elements - extract text and optional label
231+
formula_text = self._process_formula(child)
232+
if formula_text.strip():
233+
fulltext_sections.append(f"{formula_text}\n\n")
224234

225235
return "".join(fulltext_sections)
226236

@@ -272,16 +282,23 @@ def _process_div_and_nested_divs(self, div: Tag, annex_sections: list) -> None:
272282
if header_text not in annex_sections:
273283
annex_sections.append(header_text)
274284

275-
# Process paragraphs that are direct children of this div (not in nested divs)
285+
# Process direct children of this div in document order
286+
# This captures paragraphs, formulas, and other elements as they appear
276287
for child in div.children:
277-
if hasattr(child, 'name') and child.name == "p":
288+
if not hasattr(child, 'name') or not child.name:
289+
continue
290+
291+
if child.name == "p":
278292
paragraph_text = self._process_paragraph(child)
279293
if paragraph_text.strip():
280294
annex_sections.append(f"{paragraph_text}\n\n")
281-
282-
# Process nested div elements
283-
for child in div.children:
284-
if hasattr(child, 'name') and child.name == "div":
295+
elif child.name == "formula":
296+
# Handle formula elements
297+
formula_text = self._process_formula(child)
298+
if formula_text.strip():
299+
annex_sections.append(f"{formula_text}\n\n")
300+
elif child.name == "div":
301+
# Process nested div elements
285302
self._process_div_and_nested_divs(child, annex_sections)
286303

287304
def _extract_references(self, soup: BeautifulSoup) -> str:
@@ -338,6 +355,34 @@ def _process_paragraph(self, p_element: Tag) -> str:
338355

339356
return "".join(text_parts).strip()
340357

358+
def _process_formula(self, formula_element: Tag) -> str:
    """Render a TEI <formula> element as markdown.

    The formula body is italicized; when the element carries a <label>
    child (an equation number such as "(1)"), it is appended after the
    italicized text. Returns an empty string when the formula has no
    textual content.
    """
    pieces = []
    label = ""

    for node in formula_element.children:
        if hasattr(node, 'name') and node.name == "label":
            # Equation label (e.g. "(1)") is kept out of the italic body
            label = node.get_text().strip()
        elif isinstance(node, NavigableString):
            pieces.append(str(node))
        else:
            # Any other nested markup contributes its plain text
            pieces.append(node.get_text())

    body = "".join(pieces).strip()
    if not body:
        return ""
    # *formula* label when a label exists, plain *formula* otherwise
    return f"*{body}* {label}" if label else f"*{body}*"
385+
341386
def _table_to_markdown(self, table_element: Tag) -> str:
342387
"""Convert a table element to simple markdown."""
343388
markdown_lines = []

tests/test_conversions.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,92 @@ def test_offset_validation_for_specific_references(self):
622622
else:
623623
print("No offset differences detected between conversion and expected output")
624624

625+
def test_formula_extraction_in_json(self):
    """Test that formula elements are extracted as passages in JSON conversion."""
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Convert the resource TEI file, which is known to contain formulas
    tei_path = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    json_data = TEI2LossyJSONConverter().convert_tei_file(tei_path, stream=False)

    # Collect the passages tagged as formulas
    formula_passages = [
        passage
        for passage in json_data.get('body_text', [])
        if passage.get('type') == 'formula'
    ]

    # The test file contains 2 formulas
    assert len(formula_passages) >= 2, "Should extract at least 2 formulas from test file"

    # Every formula passage carries an id and non-empty text
    for formula in formula_passages:
        assert 'text' in formula, "Formula should have text"
        assert 'id' in formula, "Formula should have id"
        assert formula['text'].strip(), "Formula text should not be empty"

        # When a label was extracted it must not be blank
        if 'label' in formula:
            assert formula['label'].strip(), "Formula label should not be empty"

    # Spot-check known formula content from the test file
    assert any('Fext' in formula.get('text', '') for formula in formula_passages), "Should extract formula containing 'Fext'"
655+
656+
def test_formula_extraction_in_markdown(self):
    """Test that formula elements are included in Markdown conversion."""
    from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

    # Convert the resource TEI file, which is known to contain formulas
    tei_path = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    markdown = TEI2MarkdownConverter().convert_tei_file(tei_path)

    # The formula's text must survive the conversion...
    assert 'Fext' in markdown, "Markdown should contain formula text 'Fext'"

    # ...and be rendered in italics (asterisk-delimited)
    assert '*Fext' in markdown, "Formula should be italicized in Markdown"
671+
672+
def test_header_only_div_in_json(self):
    """Test that headers without paragraphs are included in JSON conversion."""
    from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter

    # Convert the resource TEI file
    tei_path = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    json_data = TEI2LossyJSONConverter().convert_tei_file(tei_path, stream=False)

    # Gather the distinct non-empty section headers across the body passages
    section_headers = {
        passage['head_section']
        for passage in json_data.get('body_text', [])
        if 'head_section' in passage and passage['head_section']
    }

    # The test file has Acknowledgements and Competing interests headers
    assert 'Competing interests' in section_headers, "Should include 'Competing interests' header"

    # Sanity check: a realistic document yields many distinct sections
    assert len(section_headers) >= 10, f"Should extract many section headers, got {len(section_headers)}"
695+
696+
def test_header_only_div_in_markdown(self):
    """Test that headers without paragraphs are included in Markdown conversion."""
    from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter

    # Convert the resource TEI file
    tei_path = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
    markdown = TEI2MarkdownConverter().convert_tei_file(tei_path)

    # The header text itself must appear...
    assert 'Acknowledgements' in markdown, "Markdown should contain 'Acknowledgements' header"

    # ...and be rendered as a level-3 markdown heading
    assert '### Acknowledgements' in markdown, "Acknowledgements should be formatted as Markdown header"
625711

626712
def test_conversion_JSON(self):
627713
from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
@@ -648,3 +734,4 @@ def test_conversion_JSON(self):
648734
actual_text = paragraph_text[offset_start:offset_end]
649735
assert actual_text == ref_text, f"Reference text at offsets ({offset_start}-{offset_end}) should match '{ref_text}' but got '{actual_text}'"
650736

737+

0 commit comments

Comments
 (0)