Skip to content

Commit 2fe398d

Browse files
committed
secsgml2 integrated
1 parent 6525a16 commit 2fe398d

File tree

6 files changed

+40 additions, -21 deletions (61 lines changed)

datamule/datamule/datamule/downloader.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -165,7 +165,7 @@ def decompress_and_parse_and_write(self, compressed_chunks, filename, keep_docum
165165
content = decompressed_content.getvalue()
166166

167167
metadata, documents = parse_sgml_content_into_memory(
168-
bytes_content=content,
168+
data=content,
169169
filter_document_types=keep_document_types
170170
)
171171

@@ -187,7 +187,7 @@ def parse_and_write_regular_file(self, chunks, filename, keep_document_types, ta
187187
content = b''.join(chunks)
188188

189189
metadata, documents = parse_sgml_content_into_memory(
190-
bytes_content=content,
190+
data=content,
191191
filter_document_types=keep_document_types
192192
)
193193

datamule/datamule/document/document.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -253,8 +253,17 @@ def __init__(self, type, content, extension,accession,filing_date,path=None):
253253

254254
if path is not None:
255255
# need to think through document parsing w/ and w/o path... e.g. from url, metadata should fill it
256-
self.path = Path(path)
257-
self.filename = self.path.stem
256+
path_str = str(path)
257+
if isinstance(path, Path) or "::" not in path_str:
258+
self.path = Path(path)
259+
else:
260+
# Preserve virtual "tar::inner/path" references
261+
self.path = path
262+
263+
filename_source = path_str
264+
if "::" in filename_source:
265+
filename_source = filename_source.split("::", 1)[1]
266+
self.filename = Path(filename_source).stem
258267

259268

260269

@@ -573,4 +582,4 @@ def get_tables(self, description_regex=None, description_fields=['preamble', 'po
573582
name=name,
574583
contains_regex=contains_regex,
575584
title_regex=title_regex
576-
)
585+
)

datamule/datamule/portfolio/portfolio_compression_utils_legacy.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -159,14 +159,16 @@ def decompress_portfolio(self, portfolio, max_workers=None):
159159
output_dir.mkdir(exist_ok=True)
160160

161161
# Get all files for this accession
162-
accession_files = [m for m in tar.getmembers()
163-
if m.name.startswith(f'{accession_dir}/') and m.isfile()]
162+
accession_files = [
163+
m.name for m in tar.getmembers()
164+
if m.name.startswith(f'{accession_dir}/') and m.isfile()
165+
]
164166

165167
# Parallel file extraction
166168
with ThreadPoolExecutor(max_workers=max_workers) as executor:
167169
file_futures = [
168-
executor.submit(self._extract_file, member, tar, accession_dir, output_dir)
169-
for member in accession_files
170+
executor.submit(self._extract_file, member_name, batch_tar, accession_dir, output_dir)
171+
for member_name in accession_files
170172
]
171173

172174
# Wait for all files to be processed
@@ -209,12 +211,17 @@ def _process_document(self, doc, compression, threshold, compression_level):
209211

210212
return content, compression_type
211213

212-
def _extract_file(self, member, tar, accession_dir, output_dir):
214+
def _extract_file(self, member_name, batch_tar_path, accession_dir, output_dir):
213215
"""Extract and decompress a single file from tar."""
214-
relative_path = member.name[len(accession_dir)+1:] # Remove accession prefix
216+
relative_path = member_name[len(accession_dir)+1:] # Remove accession prefix
215217
output_path = output_dir / relative_path
216218

217-
content = tar.extractfile(member).read()
219+
# Open the tar per thread to avoid concurrent reads on a shared TarFile handle.
220+
with tarfile.open(batch_tar_path, 'r') as tar:
221+
extracted = tar.extractfile(member_name)
222+
if extracted is None:
223+
return
224+
content = extracted.read()
218225

219226
# Handle decompression based on filename
220227
if relative_path.endswith('.gz'):
@@ -285,4 +292,4 @@ def _write_submission_to_tar(self, tar_handle, submission, documents, compressio
285292

286293
tarinfo = tarfile.TarInfo(name=f'{accession_prefix}/{filename}')
287294
tarinfo.size = len(content)
288-
tar_handle.addfile(tarinfo, io.BytesIO(content))
295+
tar_handle.addfile(tarinfo, io.BytesIO(content))

datamule/datamule/sec/submissions/downloader.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from ...helper import _process_cik_and_metadata_filters
55
import tarfile
66
import io
7-
from secsgml2.parse_sgml import parse_sgml_content_into_memory
7+
from secsgml2 import parse_sgml_content_into_memory
88
from secsgml2.utils import calculate_documents_locations_in_tar
99

1010

@@ -50,10 +50,10 @@ def write_sgml_file_to_tar(output_path, bytes_content=None, input_path=None,filt
5050

5151
with open(input_path, 'rb') as f:
5252
bytes_content = f.read()
53-
metadata, documents = parse_sgml_content_into_memory(bytes_content=bytes_content,filter_document_types=filter_document_types)
53+
metadata, documents = parse_sgml_content_into_memory(data=bytes_content,filter_document_types=filter_document_types)
5454
else:
5555
# Use content directly
56-
metadata, documents = parse_sgml_content_into_memory(bytes_content=bytes_content, filter_document_types=filter_document_types)
56+
metadata, documents = parse_sgml_content_into_memory(data=bytes_content, filter_document_types=filter_document_types)
5757

5858
write_submission_to_tar(output_path,metadata,documents)
5959

datamule/datamule/submission/submission.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,13 @@ def __init__(self, path=None, sgml_content=None, keep_document_types=None,
117117
raise ValueError(f"URL: {url}, Error: {response.getcode()}")
118118

119119
self.path = None
120-
metadata, raw_documents = parse_sgml_content_into_memory(sgml_content)
120+
metadata, raw_documents = parse_sgml_content_into_memory(data=sgml_content)
121121

122122

123123

124124
self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=None)
125-
self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
125+
fd = self.metadata.content.get('filing-date')
126+
self.filing_date = f"{fd[:4]}-{fd[4:6]}-{fd[6:8]}" if fd else None
126127

127128
self.documents_obj_list = []
128129
filtered_metadata_documents = []
@@ -158,7 +159,8 @@ def __init__(self, path=None, sgml_content=None, keep_document_types=None,
158159

159160
self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
160161

161-
self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
162+
fd = self.metadata.content.get('filing-date')
163+
self.filing_date = f"{fd[:4]}-{fd[4:6]}-{fd[6:8]}" if fd else None
162164

163165
elif path is not None:
164166
self.path = Path(path)
@@ -177,7 +179,8 @@ def __init__(self, path=None, sgml_content=None, keep_document_types=None,
177179
metadata = json.load(f)
178180

179181
self.metadata = Document(type='submission_metadata', content=metadata, extension='.json',filing_date=None,accession=None,path=metadata_path)
180-
self.filing_date= f"{self.metadata.content['filing-date'][:4]}-{self.metadata.content['filing-date'][4:6]}-{self.metadata.content['filing-date'][6:8]}"
182+
fd = self.metadata.content.get('filing-date')
183+
self.filing_date = f"{fd[:4]}-{fd[4:6]}-{fd[6:8]}" if fd else None
181184

182185

183186
# booleans

datamule/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
setup(
3333
name="datamule",
3434
author="John Friedman",
35-
version="3.3.0",
35+
version="3.4.0",
3636
description="Work with SEC submissions at scale.",
3737
packages=find_packages(include=['datamule', 'datamule.*']),
3838
url="https://github.com/john-friedman/datamule-python",

0 commit comments

Comments
 (0)