Skip to content

Commit e5bd0f6

Browse files
committed
3.2.6
1 parent 5e1ec6e commit e5bd0f6

3 files changed

Lines changed: 53 additions & 48 deletions

File tree

datamule/datamule/document/document.py

Lines changed: 44 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,8 @@ def _get_text_sources(self):
110110
# Original behavior - single text source
111111
self._text_sources = [{'id': None, 'text': str(self.document.text)}]
112112
else: # mode == 'data'
113-
self._text_sources = [{'id':data_tuple[0],'text':data_tuple[2]} for data_tuple in self.document.data_tuples if data_tuple[1] in ['text','title','textsmall','table_preamble', 'table_footnote', 'table_postamble']]
113+
self._text_sources = [{'id':data_tuple[0],'text':data_tuple[2]} for data_tuple in self.document.data_tuples if data_tuple[1] in ['text','title','textsmall','table_preamble', 'table_footnote', 'table_postamble'] and data_tuple[2] is not None]
114+
114115
return self._text_sources
115116

116117
def _format_results(self, results, fragment_id):
@@ -457,50 +458,48 @@ def parse_tables(self, must_exist_in_mapping=True):
457458
tables = Tables(document_type=self.type, accession=self.accession)
458459
data_tuples = self.data_tuples
459460

460-
# Group table components by id
461-
i = 0
462-
while i < len(data_tuples):
463-
id, type, content, level = data_tuples[i]
461+
# Group all tuples by ID first
462+
tuples_by_id = {}
463+
for id, type, content, level in data_tuples:
464+
if id not in tuples_by_id:
465+
tuples_by_id[id] = []
466+
tuples_by_id[id].append((type, content))
467+
468+
# Now process each ID that has table_data
469+
for id, tuples in tuples_by_id.items():
470+
# Check if this ID contains a table
471+
if not any(t[0] == 'table_data' for t in tuples):
472+
continue
464473

465-
if type == "table_data":
466-
# Start collecting table components
467-
table_data = content
468-
preamble = None
469-
footnotes = []
470-
postamble = None
471-
472-
# Look backwards for preamble (same id, comes before table_data)
473-
j = i - 1
474-
while j >= 0 and data_tuples[j][0] == id:
475-
_, comp_type, comp_content, _ = data_tuples[j]
476-
if comp_type == "table_preamble":
477-
preamble = comp_content
478-
break
479-
j -= 1
480-
481-
# Look forwards for footnotes and postamble (same id, comes after table_data)
482-
j = i + 1
483-
while j < len(data_tuples) and data_tuples[j][0] == id:
484-
_, comp_type, comp_content, _ = data_tuples[j]
485-
if comp_type == "table_footnote":
486-
footnotes.append(comp_content)
487-
elif comp_type == "table_postamble":
488-
postamble = comp_content
489-
j += 1
490-
491-
# Add the complete table with all components
492-
tables.add_table(
493-
data=table_data,
494-
name="extracted_table",
495-
preamble=preamble,
496-
footnotes=footnotes if footnotes else None,
497-
postamble=postamble
498-
)
474+
# Extract all components
475+
table_title = None
476+
table_data = None
477+
preamble = None
478+
footnotes = []
479+
postamble = None
499480

500-
i += 1
501-
481+
for type, content in tuples:
482+
if type == 'table_title':
483+
table_title = content
484+
elif type == 'table_data':
485+
table_data = content
486+
elif type == 'table_preamble':
487+
preamble = content
488+
elif type == 'table_footnote':
489+
footnotes.append(content)
490+
elif type == 'table_postamble':
491+
postamble = content
492+
493+
# Add the complete table
494+
tables.add_table(
495+
data=table_data,
496+
name=table_title if table_title is not None else "extracted_table",
497+
preamble=preamble,
498+
footnotes=footnotes if footnotes else None,
499+
postamble=postamble
500+
)
501+
502502
self._tables = tables
503-
504503
else:
505504
self._tables = []
506505
@property
@@ -633,12 +632,13 @@ def get_section(self, title=None, title_regex=None,title_class=None, format='dic
633632
return [flatten_dict(item[1],format) for item in result]
634633

635634

636-
def get_tables(self, description_regex=None, description_fields=['preamble', 'postamble', 'footnotes'], name=None, contains_regex=None):
635+
def get_tables(self, description_regex=None, description_fields=['preamble', 'postamble', 'footnotes'], name=None, contains_regex=None,title_regex=None):
637636
# make sure tables is initialized
638637
self.tables
639638
return self._tables.get_tables(
640639
description_regex=description_regex,
641640
description_fields=description_fields,
642641
name=name,
643-
contains_regex=contains_regex
642+
contains_regex=contains_regex,
643+
title_regex=title_regex
644644
)

datamule/datamule/tables/tables.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -185,9 +185,8 @@ def add_table(self, data, name, description=None, preamble=None, footnotes=None,
185185
postamble=postamble
186186
))
187187

188-
def get_tables(self, description_regex=None, description_fields=['preamble', 'postamble', 'footnotes'], name=None, contains_regex=None):
188+
def get_tables(self, description_regex=None, description_fields=['preamble', 'postamble', 'footnotes'], name=None, contains_regex=None, title_regex=None):
189189
matching_tables = []
190-
191190

192191
for table in self.tables:
193192
# Check name match (exact match)
@@ -196,6 +195,12 @@ def get_tables(self, description_regex=None, description_fields=['preamble', 'po
196195
matching_tables.append(table)
197196
continue
198197

198+
199+
if title_regex is not None:
200+
if table.name and re.search(title_regex, table.name):
201+
matching_tables.append(table)
202+
continue
203+
199204
# Check description regex match
200205
if description_regex is not None:
201206
description_matched = False
@@ -226,7 +231,7 @@ def get_tables(self, description_regex=None, description_fields=['preamble', 'po
226231
continue
227232

228233
# Check contains_regex match (only if description_regex didn't already handle it)
229-
if contains_regex is not None and description_regex is None and name is None:
234+
if contains_regex is not None and description_regex is None and name is None and title_regex is None:
230235
if self._check_contains_regex(table, contains_regex):
231236
matching_tables.append(table)
232237

datamule/setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232
setup(
3333
name="datamule",
3434
author="John Friedman",
35-
version="3.2.5",
35+
version="3.2.6",
3636
description="Work with SEC submissions at scale.",
3737
packages=find_packages(include=['datamule', 'datamule.*']),
3838
url="https://github.com/john-friedman/datamule-python",

0 commit comments

Comments
 (0)