Skip to content

Commit 905182f

Browse files
committed
fixed probe uuencoding
1 parent cf3740b commit 905182f

File tree

3 files changed

+36
-27
lines changed

3 files changed

+36
-27
lines changed

datamule/datamule/datamule/tar_downloader.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ def close_all(self):
393393
async def download_and_process(self, session, url, semaphore, extraction_pool, tar_manager, output_dir, pbar, keep_document_types, keep_filtered_metadata):
394394
async with semaphore:
395395
filename = url.split('/')[-1]
396+
396397
accession_num = filename.replace('.tar', '').split('/')[-1]
397398

398399
api_key = self.api_key
@@ -438,6 +439,9 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
438439
extraction_pool,
439440
partial(self._extract_documents_from_probe, probe_bytes, metadata_with_positions, keep_document_types)
440441
)
442+
443+
for doc in documents:
444+
doc['content'] = should_decode_file_from_content(doc['content'])
441445

442446
elif keep_document_types == ['metadata']:
443447
# Only metadata requested
@@ -462,7 +466,7 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
462466
for doc in probe_documents:
463467
doc['content'] = should_decode_file_from_content(doc['content'])
464468
documents.extend(probe_documents)
465-
469+
466470
# Download each document beyond probe individually
467471
if docs_beyond_probe:
468472
for doc in docs_beyond_probe:
@@ -492,8 +496,9 @@ async def download_and_process(self, session, url, semaphore, extraction_pool, t
492496
extraction_pool,
493497
partial(self._decompress_zstd, doc_content)
494498
)
495-
499+
496500
decompressed = should_decode_file_from_content(decompressed)
501+
497502

498503
documents.append({
499504
'name': doc_name,
@@ -584,7 +589,6 @@ def download(self, accession_numbers, output_dir="downloads",
584589
for accession in accession_numbers:
585590
url = f"{self.BASE_URL}{format_accession(accession,'no-dash').zfill(18)}.tar"
586591
urls.append(url)
587-
588592
# Deduplicate URLs
589593
urls = list(set(urls))
590594

datamule/datamule/utils/pdf.py

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,30 @@
from contextlib import ExitStack

import pypdfium2 as pdfium


def has_extractable_text(pdf_bytes):
    """
    Lightweight check for whether a PDF has extractable text.

    Only the first page is inspected.

    Args:
        pdf_bytes: PDF content as bytes.

    Returns:
        True if the first page yields any non-whitespace text. False if it
        yields none, if the document has no pages, or if the PDF cannot be
        opened/parsed (any exception is treated as "no text").
    """
    try:
        # ExitStack runs the registered close() calls in reverse order
        # (textpage, page, document) on every exit path, so handles are
        # released on the empty-document early return and when extraction
        # raises, not only on success.
        with ExitStack() as stack:
            pdf = pdfium.PdfDocument(pdf_bytes)
            stack.callback(pdf.close)

            # Only check the first page
            if len(pdf) == 0:
                return False

            page = pdf[0]
            stack.callback(page.close)

            textpage = page.get_textpage()
            stack.callback(textpage.close)

            # Whitespace-only output does not count as extractable text
            has_text = bool(textpage.get_text_range().strip())
    except Exception:
        # Deliberate best-effort: if can't open/parse, assume no text
        return False

    return has_text

datamule/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
setup(
3333
name="datamule",
3434
author="John Friedman",
35-
version="3.2.9",
35+
version="3.3.0",
3636
description="Work with SEC submissions at scale.",
3737
packages=find_packages(include=['datamule', 'datamule.*']),
3838
url="https://github.com/john-friedman/datamule-python",

0 commit comments

Comments
 (0)