@@ -110,7 +110,8 @@ def _get_text_sources(self):
110110 # Original behavior - single text source
111111 self ._text_sources = [{'id' : None , 'text' : str (self .document .text )}]
112112 else : # mode == 'data'
113- self ._text_sources = [{'id' :data_tuple [0 ],'text' :data_tuple [2 ]} for data_tuple in self .document .data_tuples if data_tuple [1 ] in ['text' ,'title' ,'textsmall' ,'table_preamble' , 'table_footnote' , 'table_postamble' ]]
113+ self ._text_sources = [{'id' :data_tuple [0 ],'text' :data_tuple [2 ]} for data_tuple in self .document .data_tuples if data_tuple [1 ] in ['text' ,'title' ,'textsmall' ,'table_preamble' , 'table_footnote' , 'table_postamble' ] and data_tuple [2 ] is not None ]
114+
114115 return self ._text_sources
115116
116117 def _format_results (self , results , fragment_id ):
@@ -457,50 +458,48 @@ def parse_tables(self, must_exist_in_mapping=True):
457458 tables = Tables (document_type = self .type , accession = self .accession )
458459 data_tuples = self .data_tuples
459460
460- # Group table components by id
461- i = 0
462- while i < len (data_tuples ):
463- id , type , content , level = data_tuples [i ]
461+ # Group all tuples by ID first
462+ tuples_by_id = {}
463+ for id , type , content , level in data_tuples :
464+ if id not in tuples_by_id :
465+ tuples_by_id [id ] = []
466+ tuples_by_id [id ].append ((type , content ))
467+
468+ # Now process each ID that has table_data
469+ for id , tuples in tuples_by_id .items ():
470+ # Check if this ID contains a table
471+ if not any (t [0 ] == 'table_data' for t in tuples ):
472+ continue
464473
465- if type == "table_data" :
466- # Start collecting table components
467- table_data = content
468- preamble = None
469- footnotes = []
470- postamble = None
471-
472- # Look backwards for preamble (same id, comes before table_data)
473- j = i - 1
474- while j >= 0 and data_tuples [j ][0 ] == id :
475- _ , comp_type , comp_content , _ = data_tuples [j ]
476- if comp_type == "table_preamble" :
477- preamble = comp_content
478- break
479- j -= 1
480-
481- # Look forwards for footnotes and postamble (same id, comes after table_data)
482- j = i + 1
483- while j < len (data_tuples ) and data_tuples [j ][0 ] == id :
484- _ , comp_type , comp_content , _ = data_tuples [j ]
485- if comp_type == "table_footnote" :
486- footnotes .append (comp_content )
487- elif comp_type == "table_postamble" :
488- postamble = comp_content
489- j += 1
490-
491- # Add the complete table with all components
492- tables .add_table (
493- data = table_data ,
494- name = "extracted_table" ,
495- preamble = preamble ,
496- footnotes = footnotes if footnotes else None ,
497- postamble = postamble
498- )
474+ # Extract all components
475+ table_title = None
476+ table_data = None
477+ preamble = None
478+ footnotes = []
479+ postamble = None
499480
500- i += 1
501-
481+ for type , content in tuples :
482+ if type == 'table_title' :
483+ table_title = content
484+ elif type == 'table_data' :
485+ table_data = content
486+ elif type == 'table_preamble' :
487+ preamble = content
488+ elif type == 'table_footnote' :
489+ footnotes .append (content )
490+ elif type == 'table_postamble' :
491+ postamble = content
492+
493+ # Add the complete table
494+ tables .add_table (
495+ data = table_data ,
496+ name = table_title if table_title is not None else "extracted_table" ,
497+ preamble = preamble ,
498+ footnotes = footnotes if footnotes else None ,
499+ postamble = postamble
500+ )
501+
502502 self ._tables = tables
503-
504503 else :
505504 self ._tables = []
506505 @property
@@ -633,12 +632,13 @@ def get_section(self, title=None, title_regex=None,title_class=None, format='dic
633632 return [flatten_dict (item [1 ],format ) for item in result ]
634633
635634
636- def get_tables (self , description_regex = None , description_fields = ['preamble' , 'postamble' , 'footnotes' ], name = None , contains_regex = None ):
635+ def get_tables (self , description_regex = None , description_fields = ['preamble' , 'postamble' , 'footnotes' ], name = None , contains_regex = None , title_regex = None ):
637636 # make sure tables is initialized
638637 self .tables
639638 return self ._tables .get_tables (
640639 description_regex = description_regex ,
641640 description_fields = description_fields ,
642641 name = name ,
643- contains_regex = contains_regex
642+ contains_regex = contains_regex ,
643+ title_regex = title_regex
644644 )
0 commit comments