Skip to content

Commit 91bc1e1

Browse files
committed
tweak to bulk tar downloading
1 parent 5f32ef4 commit 91bc1e1

4 files changed

Lines changed: 23 additions & 18 deletions

File tree

datamule/datamule/datamule/tar_downloader.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from threading import Lock
1515
from os import cpu_count
1616
from secsgml2.utils import calculate_documents_locations_in_tar
17-
from secsgml2 import decode_uuencoded_content
1817
from ..utils.format_accession import format_accession
1918
from ..providers.providers import SEC_FILINGS_TAR_BUCKET_ENDPOINT
2019
from .datamule_lookup import datamule_lookup
@@ -438,8 +437,6 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
438437
partial(self._extract_documents_from_probe, probe_bytes, metadata_with_positions, keep_document_types)
439438
)
440439

441-
for doc in documents:
442-
doc['content'] = decode_uuencoded_content(doc['content'])
443440

444441
elif keep_document_types == ['metadata']:
445442
# Only metadata requested
@@ -461,8 +458,6 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
461458
partial(self._extract_documents_from_probe_by_list, probe_bytes, docs_in_probe)
462459
)
463460

464-
for doc in probe_documents:
465-
doc['content'] = decode_uuencoded_content(doc['content'])
466461
documents.extend(probe_documents)
467462

468463
# Download each document beyond probe individually
@@ -495,7 +490,6 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
495490
partial(self._decompress_zstd, doc_content)
496491
)
497492

498-
decompressed = decode_uuencoded_content(decompressed)
499493

500494

501495
documents.append({

datamule/datamule/mapping_dicts/xml_mapping_jsons/ex103.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"/Comments/commentData/fieldName": "fieldName",
55
"/Comments/commentData/itemNumber": "itemNumber",
66
"/comments/commentData/Comment": "comment",
7+
"/comments/commentData/columnName": "columnName",
78
"/comments/commentData/ItemNumber": "itemNumber",
89
"/comments/commentData/comment": "comment",
910
"/comments/commentData/commentColumn": "commentColumn",
@@ -13,4 +14,4 @@
1314
"/comments/commentData/fieldName": "fieldName",
1415
"/comments/commentData/itemNumber": "itemNumber"
1516
}
16-
}
17+
}

datamule/datamule/tables/tableparser.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import xml.etree.ElementTree as ET
22
from io import BytesIO
33

4-
# Prototype code, change #
5-
64
def parser(xml_bytes, mapping):
75
rows = []
86

@@ -21,16 +19,13 @@ def parser(xml_bytes, mapping):
2119
_stack.pop()
2220

2321
for table_name, table_mapping in mapping.items():
24-
# Filter mapping to only paths present in this file
2522
table_mapping = {k: v for k, v in table_mapping.items() if k in real_paths}
2623
if not table_mapping:
2724
continue
2825

29-
# Split mapping into text paths and attribute paths
3026
attr_mapping = {k: v for k, v in table_mapping.items() if "/@" in k}
3127
text_mapping = {k: v for k, v in table_mapping.items() if "/@" not in k}
3228

33-
# Segment-wise common prefix (not character-wise)
3429
base_paths = [k.rsplit("/@", 1)[0] if "/@" in k else k for k in table_mapping.keys()]
3530
split_paths = [p.strip("/").split("/") for p in base_paths]
3631
prefix_segments = []
@@ -46,6 +41,14 @@ def parser(xml_bytes, mapping):
4641
empty_row = {col: None for col in table_mapping.values()}
4742
current_path = []
4843
current_row = empty_row.copy()
44+
# Track accumulated values for repeating fields
45+
accumulator = {col: [] for col in table_mapping.values()}
46+
47+
def flush_row():
48+
# Merge accumulator into current_row as pipe-delimited strings
49+
for col, values in accumulator.items():
50+
if values:
51+
current_row[col] = "|".join(values)
4952

5053
for event, elem in ET.iterparse(BytesIO(xml_bytes), events=("start", "end")):
5154
tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag.split(":")[-1]
@@ -57,24 +60,31 @@ def parser(xml_bytes, mapping):
5760

5861
# Handle text content
5962
if path in text_mapping:
60-
current_row[text_mapping[path]] = elem.text
63+
col = text_mapping[path]
64+
if elem.text and elem.text.strip():
65+
accumulator[col].append(elem.text.strip())
6166

6267
# Handle attributes
6368
for attr_name, attr_value in elem.attrib.items():
6469
attr_path = f"{path}/@{attr_name}"
6570
if attr_path in attr_mapping:
66-
current_row[attr_mapping[attr_path]] = attr_value
71+
col = attr_mapping[attr_path]
72+
accumulator[col].append(attr_value)
6773

6874
if path == row_boundary:
75+
flush_row()
6976
current_row["_table"] = table_name
70-
rows.append(current_row)
77+
if any(v is not None and str(v).strip() for k, v in current_row.items() if k != "_table"):
78+
rows.append(current_row)
7179
current_row = empty_row.copy()
80+
accumulator = {col: [] for col in table_mapping.values()}
7281

7382
current_path.pop()
7483

7584
# Flush if boundary was the root (single-row tables like doc header)
76-
if any(v is not None for v in current_row.values()):
85+
flush_row()
86+
if any(v is not None and str(v).strip() for v in current_row.values()):
7787
current_row["_table"] = table_name
7888
rows.append(current_row)
7989

80-
return rows
90+
return rows

datamule/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
setup(
44
name="datamule",
55
author="John Friedman",
6-
version="3.6.3",
6+
version="3.6.4",
77
description="Work with SEC submissions at scale.",
88
packages=find_packages(include=['datamule', 'datamule.*']),
99
url="https://github.com/john-friedman/datamule-python",

0 commit comments

Comments (0)