Skip to content

Commit 91bc1e1

Browse files
committed
tweak to bulk tar downloading
1 parent 5f32ef4 commit 91bc1e1

4 files changed

Lines changed: 23 additions & 18 deletions

File tree

datamule/datamule/datamule/tar_downloader.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from threading import Lock
1515
from os import cpu_count
1616
from secsgml2.utils import calculate_documents_locations_in_tar
17-
from secsgml2 import decode_uuencoded_content
1817
from ..utils.format_accession import format_accession
1918
from ..providers.providers import SEC_FILINGS_TAR_BUCKET_ENDPOINT
2019
from .datamule_lookup import datamule_lookup
@@ -438,8 +437,6 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
438437
partial(self._extract_documents_from_probe, probe_bytes, metadata_with_positions, keep_document_types)
439438
)
440439

441-
for doc in documents:
442-
doc['content'] = decode_uuencoded_content(doc['content'])
443440

444441
elif keep_document_types == ['metadata']:
445442
# Only metadata requested
@@ -461,8 +458,6 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
461458
partial(self._extract_documents_from_probe_by_list, probe_bytes, docs_in_probe)
462459
)
463460

464-
for doc in probe_documents:
465-
doc['content'] = decode_uuencoded_content(doc['content'])
466461
documents.extend(probe_documents)
467462

468463
# Download each document beyond probe individually
@@ -495,7 +490,6 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
495490
partial(self._decompress_zstd, doc_content)
496491
)
497492

498-
decompressed = decode_uuencoded_content(decompressed)
499493

500494

501495
documents.append({

datamule/datamule/mapping_dicts/xml_mapping_jsons/ex103.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"/Comments/commentData/fieldName": "fieldName",
55
"/Comments/commentData/itemNumber": "itemNumber",
66
"/comments/commentData/Comment": "comment",
7+
"/comments/commentData/columnName": "columnName",
78
"/comments/commentData/ItemNumber": "itemNumber",
89
"/comments/commentData/comment": "comment",
910
"/comments/commentData/commentColumn": "commentColumn",
@@ -13,4 +14,4 @@
1314
"/comments/commentData/fieldName": "fieldName",
1415
"/comments/commentData/itemNumber": "itemNumber"
1516
}
16-
}
17+
}

datamule/datamule/tables/tableparser.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import xml.etree.ElementTree as ET
22
from io import BytesIO
33

4-
# Prototype code, change #
5-
64
def parser(xml_bytes, mapping):
75
rows = []
86

@@ -21,16 +19,13 @@ def parser(xml_bytes, mapping):
2119
_stack.pop()
2220

2321
for table_name, table_mapping in mapping.items():
24-
# Filter mapping to only paths present in this file
2522
table_mapping = {k: v for k, v in table_mapping.items() if k in real_paths}
2623
if not table_mapping:
2724
continue
2825

29-
# Split mapping into text paths and attribute paths
3026
attr_mapping = {k: v for k, v in table_mapping.items() if "/@" in k}
3127
text_mapping = {k: v for k, v in table_mapping.items() if "/@" not in k}
3228

33-
# Segment-wise common prefix (not character-wise)
3429
base_paths = [k.rsplit("/@", 1)[0] if "/@" in k else k for k in table_mapping.keys()]
3530
split_paths = [p.strip("/").split("/") for p in base_paths]
3631
prefix_segments = []
@@ -46,6 +41,14 @@ def parser(xml_bytes, mapping):
4641
empty_row = {col: None for col in table_mapping.values()}
4742
current_path = []
4843
current_row = empty_row.copy()
44+
# Track accumulated values for repeating fields
45+
accumulator = {col: [] for col in table_mapping.values()}
46+
47+
def flush_row():
48+
# Merge accumulator into current_row as pipe-delimited strings
49+
for col, values in accumulator.items():
50+
if values:
51+
current_row[col] = "|".join(values)
4952

5053
for event, elem in ET.iterparse(BytesIO(xml_bytes), events=("start", "end")):
5154
tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag.split(":")[-1]
@@ -57,24 +60,31 @@ def parser(xml_bytes, mapping):
5760

5861
# Handle text content
5962
if path in text_mapping:
60-
current_row[text_mapping[path]] = elem.text
63+
col = text_mapping[path]
64+
if elem.text and elem.text.strip():
65+
accumulator[col].append(elem.text.strip())
6166

6267
# Handle attributes
6368
for attr_name, attr_value in elem.attrib.items():
6469
attr_path = f"{path}/@{attr_name}"
6570
if attr_path in attr_mapping:
66-
current_row[attr_mapping[attr_path]] = attr_value
71+
col = attr_mapping[attr_path]
72+
accumulator[col].append(attr_value)
6773

6874
if path == row_boundary:
75+
flush_row()
6976
current_row["_table"] = table_name
70-
rows.append(current_row)
77+
if any(v is not None and str(v).strip() for k, v in current_row.items() if k != "_table"):
78+
rows.append(current_row)
7179
current_row = empty_row.copy()
80+
accumulator = {col: [] for col in table_mapping.values()}
7281

7382
current_path.pop()
7483

7584
# Flush if boundary was the root (single-row tables like doc header)
76-
if any(v is not None for v in current_row.values()):
85+
flush_row()
86+
if any(v is not None and str(v).strip() for v in current_row.values()):
7787
current_row["_table"] = table_name
7888
rows.append(current_row)
7989

80-
return rows
90+
return rows

datamule/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
setup(
44
name="datamule",
55
author="John Friedman",
6-
version="3.6.3",
6+
version="3.6.4",
77
description="Work with SEC submissions at scale.",
88
packages=find_packages(include=['datamule', 'datamule.*']),
99
url="https://github.com/john-friedman/datamule-python",

0 commit comments

Comments (0)