@@ -677,6 +677,7 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
677677 """
678678 Process a div and its nested content, handling various back section types.
679679 Supports nested divs for complex back sections like annex with multiple subsections.
680+ Also handles formula elements that are direct children of divs.
680681 """
681682 head = div .find ("head" )
682683 p_nodes = div .find_all ("p" )
@@ -691,10 +692,12 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
691692 if child .name == "div" or child .name .endswith (":div" ):
692693 nested_divs .append (child )
693694
694- # Count only direct child paragraphs, not those in nested divs
695+ # Count only direct child paragraphs and formulas , not those in nested divs
695696 direct_p_nodes = [child for child in div .children if hasattr (child , 'name' ) and child .name == "p" ]
697+ direct_formula_nodes = [child for child in div .children if hasattr (child , 'name' ) and child .name == "formula" ]
698+ has_direct_content = len (direct_p_nodes ) > 0 or len (direct_formula_nodes ) > 0
696699
697- if len (nested_divs ) > 0 and len ( direct_p_nodes ) == 0 :
700+ if len (nested_divs ) > 0 and not has_direct_content :
698701 # This is a container div - process each nested div independently
699702 for nested_div in nested_divs :
700703 # Skip references divs
@@ -707,11 +710,11 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
707710
708711 # Determine the section header and content type for divs with content
709712 if head :
710- if len ( direct_p_nodes ) == 0 :
711- # This div has only a head, no paragraphs (standalone head)
713+ if not has_direct_content :
714+ # This div has only a head, no paragraphs or formulas (standalone head)
712715 current_head_paragraph = self ._clean_text (head .get_text ())
713716 else :
714- # This div has both head and paragraphs - head is the section header
717+ # This div has both head and content - head is the section header
715718 head_section = self ._clean_text (head .get_text ())
716719 else :
717720 # If no head element, try to use the type attribute as head_section
@@ -726,35 +729,68 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
726729 head_section = "Author Contributions"
727730 elif div_type == "availability" :
728731 # Only set as default if this div has its own content
729- if len ( direct_p_nodes ) > 0 :
732+ if has_direct_content :
730733 head_section = "Data Availability"
731734 elif div_type == "annex" :
732735 head_section = "Annex"
733736 else :
734737 # Generic handling - capitalize and format
735738 head_section = div_type .replace ("_" , " " ).title ()
736739
737- # Process paragraphs in this div
738- if len (direct_p_nodes ) > 0 :
739- for id_p , p in enumerate (direct_p_nodes ):
740+ # Process direct children (paragraphs and formulas) in document order
741+ for child in div .children :
742+ if not hasattr (child , 'name' ) or not child .name :
743+ continue
744+
745+ if child .name == "p" :
740746 paragraph_id = get_random_id (prefix = "p_" )
741747
742748 if passage_level == "sentence" :
743- for id_s , sentence in enumerate (p .find_all ("s" )):
749+ for id_s , sentence in enumerate (child .find_all ("s" )):
744750 struct = get_formatted_passage (current_head_paragraph or head_paragraph , head_section , paragraph_id , sentence )
745751 if self .validate_refs :
746752 for ref in struct ['refs' ]:
747753 assert ref ['offset_start' ] < ref ['offset_end' ], "Wrong offsets"
748754 assert struct ['text' ][ref ['offset_start' ]:ref ['offset_end' ]] == ref ['text' ], "Cannot apply offsets"
749755 yield struct
750756 else :
751- struct = get_formatted_passage (current_head_paragraph or head_paragraph , head_section , paragraph_id , p )
757+ struct = get_formatted_passage (current_head_paragraph or head_paragraph , head_section , paragraph_id , child )
752758 if self .validate_refs :
753759 for ref in struct ['refs' ]:
754760 assert ref ['offset_start' ] < ref ['offset_end' ], "Wrong offsets"
755761 assert struct ['text' ][ref ['offset_start' ]:ref ['offset_end' ]] == ref ['text' ], "Cannot apply offsets"
756762 yield struct
757763
764+ elif child .name == "formula" :
765+ # Process formula elements as passages
766+ formula_id = get_random_id (prefix = "f_" )
767+ formula_text = self ._clean_text (child .get_text ())
768+
769+ if formula_text :
770+ # Create a passage structure for the formula
771+ formula_passage = {
772+ "id" : formula_id ,
773+ "text" : formula_text ,
774+ "coords" : [
775+ box_to_dict (coord .split ("," ))
776+ for coord in child .get ("coords" , "" ).split (";" )
777+ ] if child .has_attr ("coords" ) else [],
778+ "refs" : [],
779+ "type" : "formula"
780+ }
781+
782+ if current_head_paragraph or head_paragraph :
783+ formula_passage ["head_paragraph" ] = current_head_paragraph or head_paragraph
784+ if head_section :
785+ formula_passage ["head_section" ] = head_section
786+
787+ # Extract formula label if present
788+ label = child .find ("label" )
789+ if label :
790+ formula_passage ["label" ] = self ._clean_text (label .get_text ())
791+
792+ yield formula_passage
793+
758794 # Update head_paragraph for potential next div
759795 if current_head_paragraph is not None :
760796 head_paragraph = current_head_paragraph
0 commit comments