@@ -139,14 +139,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
139139 box_to_dict (coord .split ("," ))
140140 for coord in sentence ['coords' ].split (";" )
141141 ] if sentence .has_attr ("coords" ) else [],
142- "refs" : [
143- {
144- "type" : ref ["type" ],
145- "target" : ref ["target" ] if "target" in ref .attrs else "" ,
146- "text" : ref .text
147- }
148- for ref in sentence .find_all ("ref" , type = "bibr" )
149- ]
142+ "refs" : get_refs_with_offsets (sentence )
150143 }
151144 for id , sentence in enumerate (paragraph .find_all ("s" ))
152145 ]
@@ -161,14 +154,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
161154 box_to_dict (coord .split ("," ))
162155 for coord in paragraph ['coords' ].split (";" )
163156 ] if paragraph .has_attr ("coords" ) else [],
164- "refs" : [
165- {
166- "type" : ref ["type" ],
167- "target" : ref ["target" ] if "target" in ref .attrs else "" ,
168- "text" : ref .text
169- }
170- for ref in paragraph .find_all ("ref" , type = "bibr" )
171- ]
157+ "refs" : get_refs_with_offsets (paragraph )
172158 }
173159 for id , paragraph in enumerate (abstract_paragraph_nodes )
174160 ]
@@ -826,51 +812,89 @@ def get_random_id(prefix=""):
def get_refs_with_offsets(element):
    """Extract bibliographic references with offsets into the cleaned text.

    Walks ``element`` in document order and returns one entry for every
    ``<ref type="bibr">`` descendant whose cleaned text is non-empty.

    The offsets index into the *cleaned* text of ``element``: its text with
    leading/trailing whitespace stripped, every whitespace run collapsed to a
    single space, and HTML entities unescaped. This is intended to be the
    same cleaning that ``get_formatted_passage`` applies to the passage text
    (NOTE(review): that function is not visible here - confirm).

    Offsets are computed deterministically from the position of each
    reference in the document, not by searching for the reference string, so
    long references and repeated citation markers are handled correctly.

    Args:
        element: A BeautifulSoup-like node exposing ``name``, ``children``,
            ``get(key, default)`` and ``get_text()``. Any node without a
            truthy ``name`` is treated as text and contributes ``str(node)``.

    Returns:
        A list of dicts, in document order, with the keys ``type``,
        ``target`` (``""`` when the attribute is missing), ``text`` (the
        cleaned reference text), ``offset_start`` and ``offset_end``
        (end-exclusive), such that
        ``cleaned_text[offset_start:offset_end] == text``.
    """
    import html
    import re

    whitespace_run = re.compile(r"\s+")

    def _collapse(text):
        """Collapse every whitespace run in ``text`` to a single space."""
        return whitespace_run.sub(" ", text)

    def _clean_text(text):
        """Strip, collapse whitespace, then unescape HTML entities."""
        if not text:
            return ""
        return html.unescape(_collapse(text.strip()))

    def _iter_pieces(node):
        """Yield ``(raw_text, ref_node_or_None)`` pieces in document order."""
        if getattr(node, "name", None):
            if node.name == "ref" and node.get("type") == "bibr":
                # A reference is emitted as one piece; it is not descended
                # into, so nested markup inside it belongs to its text.
                yield node.get_text(), node
            else:
                for child in node.children:
                    yield from _iter_pieces(child)
        else:
            yield str(node), None

    pieces = list(_iter_pieces(element))
    raw_text = "".join(piece_text for piece_text, _ in pieces)

    refs = []
    piece_end = 0    # raw index just past the current piece
    raw_cursor = 0   # raw index up to which cleaned_pos is already accounted
    cleaned_pos = 0  # length of the cleaned text corresponding to raw_cursor
    for piece_text, ref_node in pieces:
        piece_start = piece_end
        piece_end += len(piece_text)
        if ref_node is None:
            continue

        ref_text = _clean_text(piece_text)
        if not ref_text:  # Only record non-empty references
            continue

        # The reference text starts at its first non-whitespace character;
        # whitespace just inside the tag belongs to the gap before it.
        first_char = piece_start + len(piece_text) - len(piece_text.lstrip())
        gap = _collapse(raw_text[raw_cursor:first_char])
        if raw_cursor == 0:
            # Leading whitespace of the whole text is stripped by cleaning.
            gap = gap.lstrip()
        # Unescaping is applied piecewise; this matches unescaping the whole
        # text unless an entity straddles a reference boundary.
        offset_start = cleaned_pos + len(html.unescape(gap))
        offset_end = offset_start + len(ref_text)

        refs.append({
            "type": ref_node.get("type", ""),
            "target": ref_node.get("target", ""),
            "text": ref_text,
            "offset_start": offset_start,
            "offset_end": offset_end,
        })

        # Resume right after the last non-whitespace character of the
        # reference so that following whitespace starts a fresh run.
        cleaned_pos = offset_end
        raw_cursor = first_char + len(piece_text.strip())

    return refs
874898
875899
876900def get_formatted_passage (head_paragraph , head_section , paragraph_id , element ):
0 commit comments