Skip to content

Commit 339d030

Browse files
committed
added book download datasets;
1 parent 4b336ee commit 339d030

6 files changed

Lines changed: 447 additions & 64 deletions

File tree

datamule/datamule/book/book.py

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,38 @@
1-
from .s3transfer import s3_transfer
1+
import os
2+
from .s3transfer import s3_transfer as _s3_transfer
3+
from .download_dataset_from_s3 import download_dataset as _download_dataset
24

35
class Book:
    """Interface to datamule's S3 layer: S3 transfers and dataset downloads.

    An API key can be supplied per call, stored on the instance, or read
    from the ``DATAMULE_API_KEY`` environment variable (checked in that
    order).
    """

    def __init__(self, api_key=None):
        """Create a Book, optionally storing ``api_key`` on the instance."""
        # Only an explicitly supplied key is stored; otherwise the
        # ``api_key`` property resolves the environment variable on access.
        if api_key is not None:
            self._api_key = api_key

    @property
    def api_key(self):
        """Return the stored key, else ``DATAMULE_API_KEY``, else ``None``."""
        return getattr(self, '_api_key', None) or os.getenv('DATAMULE_API_KEY')

    @api_key.setter
    def api_key(self, value):
        """Store ``value`` as the instance key.

        Raises:
            ValueError: If ``value`` is empty or otherwise falsy.
        """
        if not value:
            raise ValueError("API key cannot be empty")
        self._api_key = value

    def s3_transfer(self, datamule_bucket, s3_credentials, max_workers=4,
                    errors_json_filename='s3_transfer_errors.json', retry_errors=3,
                    force_daily=True, cik=None, submission_type=None, filing_date=None,
                    api_key=None, accession=None, datamule_api_key=None):
        """Transfer from datamule S3 to your S3 bucket.

        All arguments except the API key are forwarded unchanged to the
        underlying ``s3_transfer`` helper; ``accession`` is passed on as
        ``accession_number``.

        Args:
            api_key: Key for this call. Falls back to ``datamule_api_key``
                and then to the instance's ``api_key`` property.
            datamule_api_key: Deprecated alias of ``api_key``, kept so that
                callers written against the previous keyword keep working.
        """
        if datamule_api_key is not None:
            import warnings

            warnings.warn(
                "'datamule_api_key' is deprecated; use 'api_key' instead",
                DeprecationWarning,
                stacklevel=2,
            )

        # Explicit argument first, then the legacy alias, then the instance
        # property (stored key or environment variable).
        api_key = api_key or datamule_api_key or self.api_key

        _s3_transfer(datamule_bucket=datamule_bucket, s3_credentials=s3_credentials,
                     max_workers=max_workers, errors_json_filename=errors_json_filename,
                     retry_errors=retry_errors, force_daily=force_daily, cik=cik,
                     submission_type=submission_type, filing_date=filing_date,
                     api_key=api_key, accession_number=accession)

    def download_dataset(self, dataset, filename=None, api_key=None):
        """Download a pre-built dataset via the ``download_dataset`` helper.

        Args:
            dataset: Dataset identifier, e.g. ``'sec_accessions'``.
            filename: Optional output filename, forwarded to the helper.
            api_key: Key for this call. Falls back to the instance's
                ``api_key`` property.
        """
        # Use the provided key, or fall back to the instance property.
        api_key = api_key or self.api_key

        _download_dataset(dataset=dataset, filename=filename, api_key=api_key)
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import urllib.request
2+
import urllib.parse
3+
from tqdm import tqdm
4+
import json
5+
6+
# Maps the lowercase-underscore dataset identifiers accepted by
# download_dataset() to the official dataset names used in the API URL.
DATASET_NAME_MAP = {
    'sec_accessions': 'SEC Accessions Master Index',
    'sec_master_submissions': 'SEC Master Submissions Table',
    'sec_accession_cik_table': 'SEC Accession CIK Table',
    'sec_documents_table': 'SEC Documents Table',
    'sec_submission_details_table': 'SEC Submissions Details Table',
    'simple_xbrl_table': 'Simple XBRL Table',
    'proxy_voting_records_table': 'Proxy Voting Records Table',
    'institutional_holdings_table': 'Institutional Holdings Table',
    'metadata_ownership_table': 'Insider Ownership Metadata Table',
    'reporting_owner_ownership_table': 'Insider Reporting Owner Table',
    'non_derivative_transaction_ownership_table': 'Insider Non-Derivative Transactions Table',
    'non_derivative_holding_ownership_table': 'Insider Non-Derivative Holdings Table',
    'derivative_transaction_ownership_table': 'Insider Derivative Transactions Table',
    'derivative_holding_ownership_table': 'Insider Derivative Holdings Table',
    'owner_signature_ownership_table': 'Insider Owner Signatures Table',
}

# User-Agent header sent with both the API lookup and the file download.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

# Number of bytes read per iteration while streaming the file to disk.
_CHUNK_SIZE = 8192


def _filename_from_download_url(download_url, dataset):
    """Derive the output filename from the ``path`` query parameter of the URL."""
    parsed = urllib.parse.urlparse(download_url)
    query_params = urllib.parse.parse_qs(parsed.query)
    path = query_params.get('path', [''])[0]
    filename = urllib.parse.unquote(path.split('/')[-1])
    # The name is decoded after splitting, so an encoded separator could
    # reintroduce a directory component; keep only the final component.
    filename = filename.replace('\\', '/').split('/')[-1]
    return filename or f"{dataset}.download"


def download_dataset(dataset, api_key, filename=None):
    """
    Download a dataset from Datamule API

    Args:
        dataset: Dataset name (lowercase underscore format, e.g. 'sec_accessions')
        api_key: Datamule API key
        filename: Output filename (optional, extracted from URL if not provided)

    Raises:
        ValueError: If ``dataset`` is not a key of DATASET_NAME_MAP, or if
            ``api_key`` is missing or empty.
        Exception: If the API request or the download returns an HTTP error,
            or the API response reports failure.
    """
    # urllib.error is otherwise only reachable as a side effect of importing
    # urllib.request; import it explicitly for the except clauses below.
    import os
    import urllib.error

    # Map dataset name to official name
    dataset_name = DATASET_NAME_MAP.get(dataset)
    if not dataset_name:
        raise ValueError(f"Unknown dataset: {dataset}")

    # Fail fast rather than sending the literal text "None" as the key.
    if not api_key:
        raise ValueError(
            "API key is required: pass api_key or set the DATAMULE_API_KEY environment variable"
        )

    # Get download URL from API; the key is URL-encoded so reserved
    # characters cannot corrupt the query string.
    query = urllib.parse.urlencode({'api_key': api_key})
    api_url = f"https://api.datamule.xyz/dataset/{urllib.parse.quote(dataset_name)}?{query}"

    req = urllib.request.Request(api_url, headers={'User-Agent': _USER_AGENT})

    try:
        with urllib.request.urlopen(req) as response:
            data = json.loads(response.read().decode())
    except urllib.error.HTTPError as e:
        # errors='replace' keeps a non-UTF-8 error body from masking the HTTP error.
        error_body = e.read().decode(errors='replace')
        raise Exception(f"API request failed: {error_body}") from e

    if not data.get('success'):
        raise Exception(f"API error: {data.get('error', 'Unknown error')}")

    download_url = data['data']['download_url']
    size_gb = data['data']['size_gb']

    # Extract filename from URL if not provided
    if filename is None:
        filename = _filename_from_download_url(download_url, dataset)

    # Download file with progress bar
    print(f"Downloading {dataset} ({size_gb:.2f} GB)...")

    download_req = urllib.request.Request(download_url, headers={'User-Agent': _USER_AGENT})

    try:
        with urllib.request.urlopen(download_req) as response:
            total_size = int(response.headers.get('Content-Length', 0))

            try:
                with open(filename, 'wb') as f, tqdm(
                    total=total_size,
                    unit='B',
                    unit_scale=True,
                    desc=filename
                ) as pbar:
                    while True:
                        chunk = response.read(_CHUNK_SIZE)
                        if not chunk:
                            break
                        f.write(chunk)
                        pbar.update(len(chunk))
            except BaseException:
                # An interrupted transfer would otherwise leave a truncated
                # file under the final name, indistinguishable from a
                # complete download. Remove it and re-raise.
                try:
                    os.remove(filename)
                except OSError:
                    pass
                raise
    except urllib.error.HTTPError as e:
        error_body = e.read().decode(errors='replace')
        raise Exception(f"Download failed: {error_body}") from e

    print(f"Downloaded to {filename}")

datamule/datamule/portfolio/portfolio.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ def download_submissions(self, cik=None, ticker=None, submission_type=None, fili
222222

223223
# map legacy provider
224224
if provider == 'datamule':
225-
provider = 'datamule-sgml'
225+
provider = 'datamule-tar'
226226

227227
if provider == 'datamule-sgml':
228228
seclibrary_download(

datamule/docs-rewrite/docs/datamule-python/book/book.md

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,84 @@
33
Book is a class for interacting with datamule's S3 Layer.
44

55

6+
## `download_dataset`
7+
8+
Download pre-built datasets directly from Datamule.
9+
10+
Available datasets:
11+
12+
- **sec_accessions**: SEC Accessions Master Index - newline-delimited text file of all accession numbers
13+
14+
- **sec_master_submissions**: SEC Master Submissions Table - comprehensive master index with filing metadata
15+
- Columns: cik (Int64), accessionNumber (String), filingDate (Date), submissionType (String), reportDate (Date), acceptanceDateTime (Datetime), act (String), fileNumber (String), filmNumber (String), items (String), size (Int64), isXBRL (Boolean), isInlineXBRL (Boolean)
16+
17+
- **sec_accession_cik_table**: SEC Accession CIK Table - links accession numbers to company CIKs
18+
- Columns: accessionNumber (BIGINT UNSIGNED), cik (BIGINT UNSIGNED)
19+
20+
- **sec_documents_table**: SEC Documents Table - document-level details for all files within filings
21+
- Columns: accessionNumber (BIGINT UNSIGNED), documentType (VARCHAR(128)), sequence (SMALLINT), filename (VARCHAR(500)), description (VARCHAR(1000)), secsgmlSizeBytes (INT)
22+
23+
- **sec_submission_details_table**: SEC Submissions Details Table - filing metadata with submission types and dates
24+
- Columns: accessionNumber (BIGINT UNSIGNED), submissionType (VARCHAR(16)), filingDate (DATE), reportDate (DATE), detectedTime (DATETIME), containsXBRL (BOOLEAN)
25+
26+
- **simple_xbrl_table**: Simple XBRL Table - parsed XBRL facts from SEC filings
27+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), context_id (BIGINT UNSIGNED), taxonomy (VARCHAR(16)), name (VARCHAR(256)), value (TEXT), period_start_date (DATE), period_end_date (DATE), members (TEXT)
28+
29+
- **proxy_voting_records_table**: Proxy Voting Records Table - institutional investor voting records
30+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), cusip (CHAR(9)), issuerName (VARCHAR(256)), meetingDate (DATE), categoryType (VARCHAR(256)), voteDescription (VARCHAR(8192)), managementRecommendation (VARCHAR(16)), howVoted (VARCHAR(16)), sharesVoted (BIGINT UNSIGNED), sharesOnLoan (BIGINT UNSIGNED)
31+
32+
- **institutional_holdings_table**: Institutional Holdings Table - 13F institutional holdings
33+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), cusip (CHAR(9)), nameOfIssuer (VARCHAR(256)), titleOfClass (VARCHAR(256)), value (BIGINT UNSIGNED), sharesOrPrincipalAmount (BIGINT UNSIGNED), sharesOrPrincipalAmountType (VARCHAR(16)), investmentDiscretion (VARCHAR(16)), putCall (VARCHAR(16)), otherManager (VARCHAR(256)), votingAuthoritySole (BIGINT UNSIGNED), votingAuthorityShared (BIGINT UNSIGNED), votingAuthorityNone (BIGINT UNSIGNED)
34+
35+
- **metadata_ownership_table**: Insider Ownership Metadata Table - filing-level metadata for insider reports
36+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), issuerCik (BIGINT UNSIGNED), issuerName (VARCHAR(128)), issuerTradingSymbol (VARCHAR(16)), documentType (VARCHAR(8)), periodOfReport (DATE), notSubjectToSection16 (VARCHAR(8)), form3HoldingsReported (BIGINT UNSIGNED), form4TransactionsReported (BIGINT UNSIGNED), dateOfOriginalSubmission (DATE), noSecuritiesOwned (BIGINT UNSIGNED), aff10b5One (VARCHAR(8)), schemaVersion (VARCHAR(8))
37+
38+
- **reporting_owner_ownership_table**: Insider Reporting Owner Table - insider details and relationships
39+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), rptOwnerCik (BIGINT UNSIGNED), rptOwnerName (VARCHAR(256)), rptOwnerStreet1 (VARCHAR(64)), rptOwnerStreet2 (VARCHAR(64)), rptOwnerCity (VARCHAR(64)), rptOwnerState (CHAR(4)), rptOwnerStateDescription (VARCHAR(64)), rptOwnerZipCode (VARCHAR(16)), rptOwnerIsDirector (VARCHAR(8)), rptOwnerIsOfficer (VARCHAR(8)), rptOwnerIsTenPercentOwner (VARCHAR(8)), rptOwnerIsOther (VARCHAR(8)), rptOwnerOfficerTitle (VARCHAR(64)), rptOwnerOtherText (VARCHAR(64))
40+
41+
- **non_derivative_transaction_ownership_table**: Insider Non-Derivative Transactions Table - common stock transactions
42+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), securityTitle (VARCHAR(128)), transactionDate (DATE), deemedExecutionDate (DATE), transactionFormType (BIGINT UNSIGNED), transactionCode (CHAR(1)), equitySwapInvolved (VARCHAR(8)), transactionShares (BIGINT UNSIGNED), transactionPricePerShare (BIGINT UNSIGNED), transactionAcquiredDisposedCode (CHAR(1)), sharesOwnedFollowingTransaction (BIGINT UNSIGNED), valueOwnedFollowingTransaction (BIGINT UNSIGNED), directOrIndirectOwnership (CHAR(1)), natureOfOwnership (VARCHAR(256)), transactionTimeliness (CHAR(1))
43+
44+
- **non_derivative_holding_ownership_table**: Insider Non-Derivative Holdings Table - current ownership positions
45+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), securityTitle (VARCHAR(128)), sharesOwnedFollowingTransaction (BIGINT UNSIGNED), valueOwnedFollowingTransaction (BIGINT UNSIGNED), directOrIndirectOwnership (CHAR(1)), natureOfOwnership (VARCHAR(256)), transactionFormType (BIGINT UNSIGNED)
46+
47+
- **derivative_transaction_ownership_table**: Insider Derivative Transactions Table - option and warrant transactions
48+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), securityTitle (VARCHAR(128)), conversionOrExercisePrice (BIGINT UNSIGNED), transactionDate (DATE), deemedExecutionDate (DATE), transactionFormType (BIGINT UNSIGNED), transactionCode (CHAR(1)), equitySwapInvolved (VARCHAR(8)), transactionShares (BIGINT UNSIGNED), transactionPricePerShare (BIGINT UNSIGNED), transactionAcquiredDisposedCode (CHAR(1)), transactionTotalValue (BIGINT UNSIGNED), exerciseDate (DATE), expirationDate (DATE), underlyingSecurityTitle (VARCHAR(128)), underlyingSecurityShares (BIGINT UNSIGNED), underlyingSecurityValue (BIGINT UNSIGNED), sharesOwnedFollowingTransaction (BIGINT UNSIGNED), valueOwnedFollowingTransaction (BIGINT UNSIGNED), directOrIndirectOwnership (CHAR(1)), natureOfOwnership (VARCHAR(256)), transactionTimeliness (CHAR(1))
49+
50+
- **derivative_holding_ownership_table**: Insider Derivative Holdings Table - derivative security positions
51+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), securityTitle (VARCHAR(128)), conversionOrExercisePrice (BIGINT UNSIGNED), exerciseDate (DATE), expirationDate (DATE), underlyingSecurityTitle (VARCHAR(128)), underlyingSecurityShares (BIGINT UNSIGNED), underlyingSecurityValue (BIGINT UNSIGNED), sharesOwnedFollowingTransaction (BIGINT UNSIGNED), valueOwnedFollowingTransaction (BIGINT UNSIGNED), directOrIndirectOwnership (CHAR(1)), natureOfOwnership (VARCHAR(256)), transactionFormType (BIGINT UNSIGNED)
52+
53+
- **owner_signature_ownership_table**: Insider Owner Signatures Table - signature information for filings
54+
- Columns: id (BIGINT UNSIGNED), accessionNumber (BIGINT UNSIGNED), signatureName (VARCHAR(256)), signatureDate (DATE)
55+
56+
### Example
57+
58+
Download a dataset with auto-detected filename:
59+
```python
60+
from datamule import Book
61+
book = Book()
62+
63+
book.download_dataset(
64+
dataset='sec_accessions',
65+
api_key=None  # Falls back to the DATAMULE_API_KEY environment variable if set
66+
)
67+
```
68+
69+
Download with custom filename:
70+
```python
71+
book.download_dataset(
72+
dataset='institutional_holdings_table',
73+
filename='my_holdings_data.parquet'
74+
)
75+
```
76+
77+
### Parameters
78+
79+
- **dataset**: Dataset identifier (lowercase underscore format, e.g. 'sec_accessions')
80+
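- **api_key**: Your Datamule API key. Optional: if omitted, the key passed to `Book(api_key=...)` is used, falling back to the `DATAMULE_API_KEY` environment variable.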
81+
- **filename**: Optional output filename. If not provided, the filename (including its extension) is taken from the download URL; if none can be determined, `<dataset>.download` is used.
82+
83+
684
## `s3_transfer`
785

886
Transfer from datamule S3 to your S3 bucket.

0 commit comments

Comments
 (0)