Merge pull request #102 from kermitt2/bugfix/include-formula

lfoppiano · web-flow · commit b469e649ceea · 2026-01-04T10:06:54.000Z
Include formulas in the JSON and Markdown output
diff --git a/grobid_client/format/TEI2LossyJSON.py b/grobid_client/format/TEI2LossyJSON.py
@@ -677,6 +677,7 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
         """
         Process a div and its nested content, handling various back section types.
         Supports nested divs for complex back sections like annex with multiple subsections.
+        Also handles formula elements that are direct children of divs.
         """
         head = div.find("head")
         p_nodes = div.find_all("p")
@@ -691,10 +692,12 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
                 if child.name == "div" or child.name.endswith(":div"):
                     nested_divs.append(child)
 
-        # Count only direct child paragraphs, not those in nested divs
+        # Count only direct child paragraphs and formulas, not those in nested divs
         direct_p_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "p"]
+        direct_formula_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "formula"]
+        has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0
 
-        if len(nested_divs) > 0 and len(direct_p_nodes) == 0:
+        if len(nested_divs) > 0 and not has_direct_content:
             # This is a container div - process each nested div independently
             for nested_div in nested_divs:
                 # Skip references divs
@@ -707,11 +710,11 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
 
         # Determine the section header and content type for divs with content
         if head:
-            if len(direct_p_nodes) == 0:
-                # This div has only a head, no paragraphs (standalone head)
+            if not has_direct_content:
+                # This div has only a head, no paragraphs or formulas (standalone head)
                 current_head_paragraph = self._clean_text(head.get_text())
             else:
-                # This div has both head and paragraphs - head is the section header
+                # This div has both head and content - head is the section header
                 head_section = self._clean_text(head.get_text())
         else:
             # If no head element, try to use the type attribute as head_section
@@ -726,35 +729,68 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
                     head_section = "Author Contributions"
                 elif div_type == "availability":
                     # Only set as default if this div has its own content
-                    if len(direct_p_nodes) > 0:
+                    if has_direct_content:
                         head_section = "Data Availability"
                 elif div_type == "annex":
                     head_section = "Annex"
                 else:
                     # Generic handling - capitalize and format
                     head_section = div_type.replace("_", " ").title()
 
-        # Process paragraphs in this div
-        if len(direct_p_nodes) > 0:
-            for id_p, p in enumerate(direct_p_nodes):
+        # Process direct children (paragraphs and formulas) in document order
+        for child in div.children:
+            if not hasattr(child, 'name') or not child.name:
+                continue
+
+            if child.name == "p":
                 paragraph_id = get_random_id(prefix="p_")
 
                 if passage_level == "sentence":
-                    for id_s, sentence in enumerate(p.find_all("s")):
+                    for id_s, sentence in enumerate(child.find_all("s")):
                         struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
                         if self.validate_refs:
                             for ref in struct['refs']:
                                 assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
                                 assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
                         yield struct
                 else:
-                    struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
+                    struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child)
                     if self.validate_refs:
                         for ref in struct['refs']:
                             assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
                             assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
                     yield struct
 
+            elif child.name == "formula":
+                # Process formula elements as passages
+                formula_id = get_random_id(prefix="f_")
+                formula_text = self._clean_text(child.get_text())
+                
+                if formula_text:
+                    # Create a passage structure for the formula
+                    formula_passage = {
+                        "id": formula_id,
+                        "text": formula_text,
+                        "coords": [
+                            box_to_dict(coord.split(","))
+                            for coord in child.get("coords", "").split(";")
+                        ] if child.has_attr("coords") else [],
+                        "refs": [],
+                        "type": "formula"
+                    }
+                    
+                    if current_head_paragraph or head_paragraph:
+                        formula_passage["head_paragraph"] = current_head_paragraph or head_paragraph
+                    if head_section:
+                        formula_passage["head_section"] = head_section
+                    
+                    # Extract formula label if present
+                    label = child.find("label")
+                    if label:
+                        formula_passage["label"] = self._clean_text(label.get_text())
+                    
+                    yield formula_passage
+
         # Update head_paragraph for potential next div
         if current_head_paragraph is not None:
             head_paragraph = current_head_paragraph
diff --git a/grobid_client/format/TEI2Markdown.py b/grobid_client/format/TEI2Markdown.py
@@ -213,14 +213,24 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str:
             head = div.find("head")
             if head:
                 section_title = head.get_text().strip()
-                fulltext_sections.append(f"### {section_title}\n")
-
-            # Get paragraphs
-            paragraphs = div.find_all("p")
-            for p in paragraphs:
-                paragraph_text = self._process_paragraph(p)
-                if paragraph_text.strip():
-                    fulltext_sections.append(f"{paragraph_text}\n\n")
+                if section_title:
+                    fulltext_sections.append(f"### {section_title}\n")
+
+            # Process direct children of the div in document order
+            # This captures paragraphs, formulas, and other elements as they appear
+            for child in div.children:
+                if not hasattr(child, 'name') or not child.name:
+                    continue
+                    
+                if child.name == "p":
+                    paragraph_text = self._process_paragraph(child)
+                    if paragraph_text.strip():
+                        fulltext_sections.append(f"{paragraph_text}\n\n")
+                elif child.name == "formula":
+                    # Handle formula elements - extract text and optional label
+                    formula_text = self._process_formula(child)
+                    if formula_text.strip():
+                        fulltext_sections.append(f"{formula_text}\n\n")
         
         return "".join(fulltext_sections)
 
@@ -272,16 +282,23 @@ def _process_div_and_nested_divs(self, div: Tag, annex_sections: list) -> None:
             if header_text not in annex_sections:
                 annex_sections.append(header_text)
 
-        # Process paragraphs that are direct children of this div (not in nested divs)
+        # Process direct children of this div in document order
+        # This captures paragraphs, formulas, and other elements as they appear
         for child in div.children:
-            if hasattr(child, 'name') and child.name == "p":
+            if not hasattr(child, 'name') or not child.name:
+                continue
+                
+            if child.name == "p":
                 paragraph_text = self._process_paragraph(child)
                 if paragraph_text.strip():
                     annex_sections.append(f"{paragraph_text}\n\n")
-
-        # Process nested div elements
-        for child in div.children:
-            if hasattr(child, 'name') and child.name == "div":
+            elif child.name == "formula":
+                # Handle formula elements
+                formula_text = self._process_formula(child)
+                if formula_text.strip():
+                    annex_sections.append(f"{formula_text}\n\n")
+            elif child.name == "div":
+                # Process nested div elements
                 self._process_div_and_nested_divs(child, annex_sections)
 
     def _extract_references(self, soup: BeautifulSoup) -> str:
@@ -338,6 +355,34 @@ def _process_paragraph(self, p_element: Tag) -> str:
         
         return "".join(text_parts).strip()
 
+    def _process_formula(self, formula_element: Tag) -> str:
+        """Render a TEI ``<formula>`` element as a single Markdown line.
+
+        Text from every child except ``<label>`` is concatenated, whitespace
+        runs are collapsed, and literal ``*`` is backslash-escaped so it cannot
+        terminate the emphasis early. Returns ``*text*``, or ``*text* label``
+        when a label exists; returns ``""`` if there is no text besides the label.
+        """
+        formula_text_parts = []
+        label_text = ""
+
+        for child in formula_element.children:
+            if getattr(child, "name", None) == "label":
+                # Equation label such as "(1)"; emitted after the italics.
+                label_text = child.get_text().strip()
+            elif isinstance(child, NavigableString):
+                formula_text_parts.append(str(child))
+            else:
+                # Nested markup contributes only its text content.
+                formula_text_parts.append(child.get_text())
+
+        # Collapse XML line breaks/indentation; str.split() also trims the ends.
+        formula_text = " ".join("".join(formula_text_parts).split())
+        if not formula_text:
+            return ""
+        escaped = formula_text.replace("*", "\\*")
+        return f"*{escaped}* {label_text}" if label_text else f"*{escaped}*"
+
     def _table_to_markdown(self, table_element: Tag) -> str:
         """Convert a table element to simple markdown."""
         markdown_lines = []
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -622,6 +622,92 @@ def test_offset_validation_for_specific_references(self):
                 else:
                     print("No offset differences detected between conversion and expected output")
 
+    def test_formula_extraction_in_json(self):
+        """Check that the JSON conversion emits formula elements as passages of type 'formula'."""
+        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
+
+        # Fixture TEI document from the test resources; it is expected to contain formulas
+        tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
+
+        converter = TEI2LossyJSONConverter()
+        json_data = converter.convert_tei_file(tei_file, stream=False)
+
+        # Formula passages are the body_text entries whose 'type' is 'formula'
+        body_text = json_data.get('body_text', [])
+        formula_passages = [p for p in body_text if p.get('type') == 'formula']
+
+        # The fixture is noted to hold 2 formulas; '>=' keeps the test passing if more are extracted
+        assert len(formula_passages) >= 2, "Should extract at least 2 formulas from test file"
+
+        # Every formula passage must carry an id and non-blank text
+        for formula in formula_passages:
+            assert 'text' in formula, "Formula should have text"
+            assert 'id' in formula, "Formula should have id"
+            assert formula['text'].strip(), "Formula text should not be empty"
+            
+            # 'label' is optional: it is only validated when the key is present
+            if 'label' in formula:
+                assert formula['label'].strip(), "Formula label should not be empty"
+
+        # 'Fext' is a token expected in at least one of the fixture's formulas
+        formula_texts = [f.get('text', '') for f in formula_passages]
+        assert any('Fext' in t for t in formula_texts), "Should extract formula containing 'Fext'"
+
+    def test_formula_extraction_in_markdown(self):
+        """Check that the Markdown conversion includes formula text, opened by an emphasis asterisk."""
+        from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter
+
+        # Fixture TEI document from the test resources; it is expected to contain formulas
+        tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
+
+        converter = TEI2MarkdownConverter()
+        markdown = converter.convert_tei_file(tei_file)
+
+        # The formula token must appear somewhere in the output
+        assert 'Fext' in markdown, "Markdown should contain formula text 'Fext'"
+        
+        # Only the opening asterisk is checked: the token must directly follow a '*'
+        assert '*Fext' in markdown, "Formula should be italicized in Markdown"
+
+    def test_header_only_div_in_json(self):
+        """Check that expected section headers show up as 'head_section' values in the JSON output."""
+        from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
+
+        # Fixture TEI document from the test resources
+        tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
+
+        converter = TEI2LossyJSONConverter()
+        json_data = converter.convert_tei_file(tei_file, stream=False)
+
+        # Gather the distinct, non-empty 'head_section' values across all body passages
+        body_text = json_data.get('body_text', [])
+        section_headers = set()
+        for passage in body_text:
+            if 'head_section' in passage and passage['head_section']:
+                section_headers.add(passage['head_section'])
+
+        # Only 'Competing interests' is asserted by name here
+        # (the fixture is noted to also have an Acknowledgements header)
+        assert 'Competing interests' in section_headers, "Should include 'Competing interests' header"
+        
+        # Coarse sanity check: at least 10 distinct section headers are expected
+        assert len(section_headers) >= 10, f"Should extract many section headers, got {len(section_headers)}"
+
+    def test_header_only_div_in_markdown(self):
+        """Check that the 'Acknowledgements' header is rendered as a '###' heading in Markdown."""
+        from grobid_client.format.TEI2Markdown import TEI2MarkdownConverter
+
+        # Fixture TEI document from the test resources
+        tei_file = os.path.join(TEST_DATA_PATH, '0046d83a-edd6-4631-b57c-755cdcce8b7f.tei.xml')
+
+        converter = TEI2MarkdownConverter()
+        markdown = converter.convert_tei_file(tei_file)
+
+        # The header text must appear somewhere in the output
+        assert 'Acknowledgements' in markdown, "Markdown should contain 'Acknowledgements' header"
+        
+        # It must be emitted as a level-3 heading, i.e. the exact text '### Acknowledgements'
+        assert '### Acknowledgements' in markdown, "Acknowledgements should be formatted as Markdown header"
 
     def test_conversion_JSON(self):
         from grobid_client.format.TEI2LossyJSON import TEI2LossyJSONConverter
@@ -648,3 +734,4 @@ def test_conversion_JSON(self):
                         actual_text = paragraph_text[offset_start:offset_end]
                         assert actual_text == ref_text, f"Reference text at offsets ({offset_start}-{offset_end}) should match '{ref_text}' but got '{actual_text}'"
 
+