1 change: 1 addition & 0 deletions Cargo.lock


1 change: 1 addition & 0 deletions Cargo.toml
@@ -47,6 +47,7 @@ extension-module = ["pyo3/extension-module"]
bytes = "1"
criterion = "0.8"
proptest = "1.9"
tempfile = "3"

[[bench]]
name = "generators"
67 changes: 67 additions & 0 deletions README.md
@@ -673,6 +673,73 @@ sql = records_sql(1000, {
For large batches, multiple INSERT statements are generated with up to 1000 rows
each. Column names are double-quoted and string values use single-quote escaping.
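
As a sketch of the output shape, assuming the documented quoting rules (the
literal values depend on the seed, and exact whitespace may differ):

```python
from forgery import records_sql, seed

seed(42)
sql = records_sql(2, {"name": "name", "email": "email"}, "users")
print(sql)
# Representative shape (values vary by seed):
# INSERT INTO users ("name", "email") VALUES
#   ('Alice O''Brien', 'alice@example.com'),
#   ('Bob Smith', 'bob@example.com');
```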

### Streaming File Writer

For datasets that exceed available memory, `records_to_file()` generates records
in bounded-memory chunks and writes each chunk to disk before generating the next.
Memory usage is proportional to `chunk_size`, not to the total record count `n`.

```python
from forgery import Faker

fake = Faker()
fake.seed(42)

# Generate 100 million records — memory stays at ~500-800 MB
fake.records_to_file(
    100_000_000,
    {"id": "uuid", "name": "name", "amount": ("float", 0.01, 9999.99)},
    "transactions.parquet",
    chunk_size=1_000_000,  # records per chunk (default: 1M, max: 10M)
)
```

**Supported formats:** CSV (`.csv`), NDJSON (`.ndjson`/`.jsonl`), SQL (`.sql`),
Parquet (`.parquet`). Format is auto-detected from the file extension, or set
explicitly with `format="csv"`.
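
When the path has no recognizable extension, pass `format` explicitly. A
minimal sketch (the `.dat` output path is illustrative):

```python
from forgery import records_to_file, seed

seed(42)
# "export.dat" matches no known extension, so the format must be explicit.
records_to_file(
    1_000_000,
    {"id": "uuid", "email": "email"},
    "export.dat",
    format="csv",
)
```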

SQL format requires a `table` parameter:

```python
from forgery import records_to_file, seed

seed(42)
records_to_file(
    50_000_000,
    {"name": "name", "email": "email"},
    "users.sql",
    table="users",
    chunk_size=500_000,
)
```

**Progress callback** — monitor long-running writes with an optional callback:

```python
from forgery import records_to_file, seed

seed(42)
records_to_file(
    10_000_000,
    {"name": "name", "email": "email"},
    "users.csv",
    on_progress=lambda written, total: print(f"\r{written/total:.0%}", end=""),
)
```
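
The callback receives cumulative counts, while progress bars such as `tqdm`
expect increments, so feed it the delta. A sketch assuming the third-party
`tqdm` package is installed (it is not a dependency of this library):

```python
from tqdm import tqdm

from forgery import records_to_file, seed

seed(42)
with tqdm(total=10_000_000, unit="rec") as bar:
    records_to_file(
        10_000_000,
        {"name": "name", "email": "email"},
        "users.csv",
        # bar.n is tqdm's running total; update() takes the increment.
        on_progress=lambda written, total: bar.update(written - bar.n),
    )
```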

**Memory estimation** — plan chunk sizes based on available RAM:

```python
from forgery import Faker

schema = {"id": "uuid", "name": "name", "amount": ("float", 0.01, 9999.99)}
est = Faker.estimate_memory(1_000_000, schema)
print(f"~{est / 1024**2:.0f} MB per 1M records")
```
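
Because the estimate is built from average per-record sizes, it can be
inverted to pick a `chunk_size` for a given RAM budget. A sketch (the 512 MB
budget is an arbitrary example):

```python
from forgery import Faker

schema = {"id": "uuid", "name": "name", "amount": ("float", 0.01, 9999.99)}

# Estimate bytes per record from a small sample count.
per_record = Faker.estimate_memory(1_000, schema) / 1_000

budget = 512 * 1024**2  # target ~512 MB per chunk
chunk_size = min(int(budget / per_record), 10_000_000)  # respect the 10M cap
print(f"chunk_size ~ {chunk_size:,}")
```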

All streaming formats use row-major generation, so the same seed produces
identical data across CSV, NDJSON, SQL, and Parquet output.
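
A minimal sketch of that guarantee: re-seeding before each call yields the
same logical rows in both files (file names are illustrative):

```python
from forgery import records_to_file, seed

schema = {"id": "uuid", "name": "name"}

seed(42)
records_to_file(1_000_000, schema, "data.csv")

seed(42)  # same seed, different format: same rows, different encoding
records_to_file(1_000_000, schema, "data.ndjson")
```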

### Schema Field Types

| Type | Syntax | Example |
78 changes: 77 additions & 1 deletion python/forgery/__init__.py
@@ -57,7 +57,7 @@
>>> german_fake.names(10) # German names
"""

from collections.abc import Coroutine
from collections.abc import Callable, Coroutine
from typing import TYPE_CHECKING, Any

from forgery._forgery import Faker
@@ -129,6 +129,7 @@
    "ean13s",
    "email",
    "emails",
    "estimate_memory",
    "fake",
    "file_extension",
    "file_extensions",
@@ -225,6 +226,7 @@
    "records_ndjson",
    "records_parquet",
    "records_sql",
    "records_to_file",
    "records_tuples",
    "records_tuples_async",
    "remove_provider",
@@ -1491,6 +1493,80 @@ def records_sql(n: int, schema: Schema, table: str) -> str:
    return fake.records_sql(n, schema, table)


# === Streaming File Writer ===


def records_to_file(
    n: int,
    schema: Schema,
    path: str,
    *,
    format: str | None = None,
    chunk_size: int | None = None,
    table: str | None = None,
    on_progress: Callable[[int, int], None] | None = None,
) -> int:
    """Generate records and stream them to a file in chunks.

    Memory stays bounded by chunk_size regardless of total n, enabling
    generation of datasets far larger than available RAM.

    Supported formats: csv, ndjson, sql, parquet (auto-detected from
    file extension, or specified explicitly).

    Args:
        n: Total number of records to generate.
        schema: Dictionary mapping field names to type specifications.
        path: Output file path.
        format: Output format ("csv", "ndjson", "sql", "parquet") or
            None to auto-detect from the file extension.
        chunk_size: Records per chunk (default: 1,000,000). Max: 10,000,000.
        table: Table name (required for SQL format, ignored otherwise).
        on_progress: Optional callback(records_written, total) called
            after each chunk is written.

    Returns:
        The total number of records written.

    Example:
        >>> from forgery import records_to_file, seed
        >>> seed(42)
        >>> records_to_file(100, {"name": "name", "email": "email"}, "/tmp/test.csv")
        100
    """
    return fake.records_to_file(
        n,
        schema,
        path,
        format=format,
        chunk_size=chunk_size,
        table=table,
        on_progress=on_progress,
    )


def estimate_memory(n: int, schema: Schema) -> int:
    """Estimate memory usage in bytes for generating n records.

    Provides a rough estimate based on average field sizes. Useful for
    deciding chunk_size for records_to_file().

    Args:
        n: Number of records.
        schema: Dictionary mapping field names to type specifications.

    Returns:
        Estimated memory in bytes.

    Example:
        >>> from forgery import estimate_memory
        >>> est = estimate_memory(1_000_000, {"name": "name", "email": "email"})
        >>> est > 0
        True
    """
    return Faker.estimate_memory(n, schema)


# === Async Records Generation ===


48 changes: 47 additions & 1 deletion python/forgery/__init__.pyi
@@ -1,6 +1,6 @@
"""Type stubs for the forgery package."""

from collections.abc import Coroutine
from collections.abc import Callable, Coroutine
from typing import Any

from forgery._forgery import CreditCardFull as CreditCardFull
@@ -752,6 +752,52 @@ def records_sql(n: int, schema: Schema, table: str) -> str:
    """
    ...

# Streaming file writer

def records_to_file(
    n: int,
    schema: Schema,
    path: str,
    *,
    format: str | None = None,
    chunk_size: int | None = None,
    table: str | None = None,
    on_progress: Callable[[int, int], None] | None = None,
) -> int:
    """Generate records and stream them to a file in chunks.

    Memory stays bounded by chunk_size regardless of total n.

    Args:
        n: Total number of records to generate.
        schema: Dictionary mapping field names to type specifications.
        path: Output file path.
        format: Output format ("csv", "ndjson", "sql", "parquet") or None.
        chunk_size: Records per chunk (default: 1,000,000).
        table: Table name (required for SQL format).
        on_progress: Optional callback(records_written, total).

    Returns:
        The total number of records written.

    Raises:
        ValueError: If schema is invalid or format is unsupported.
        OSError: If file cannot be created.
    """
    ...

def estimate_memory(n: int, schema: Schema) -> int:
    """Estimate memory usage in bytes for generating n records.

    Args:
        n: Number of records.
        schema: Dictionary mapping field names to type specifications.

    Returns:
        Estimated memory in bytes.
    """
    ...

# Async Records generation

def records_async(
71 changes: 70 additions & 1 deletion python/forgery/_forgery.pyi
@@ -1,7 +1,7 @@
"""Type stubs for the Rust extension module."""

import builtins
from collections.abc import Coroutine
from collections.abc import Callable, Coroutine
from typing import Any, TypedDict

class CreditCardFull(TypedDict):
@@ -1095,6 +1095,75 @@ class Faker:
        """
        ...

    # Serialized output formats
    def records_csv(self, n: int, schema: Schema) -> str:
        """Generate records as a CSV string with header row."""
        ...

    def records_json(self, n: int, schema: Schema) -> str:
        """Generate records as a JSON array string."""
        ...

    def records_ndjson(self, n: int, schema: Schema) -> str:
        """Generate records as newline-delimited JSON."""
        ...

    def records_parquet(self, n: int, schema: Schema) -> bytes:
        """Generate records as Parquet file bytes."""
        ...

    def records_sql(self, n: int, schema: Schema, table: str) -> str:
        """Generate records as SQL INSERT statements."""
        ...

    # Streaming file writer
    def records_to_file(
        self,
        n: int,
        schema: Schema,
        path: str,
        format: str | None = None,
        chunk_size: int | None = None,
        table: str | None = None,
        on_progress: Callable[[int, int], None] | None = None,
    ) -> int:
        """Generate records and stream them to a file in chunks.

        Memory stays bounded by chunk_size regardless of total n.
        Supports: csv, ndjson, sql, parquet (auto-detected from file extension).

        Args:
            n: Total number of records to generate.
            schema: Dictionary mapping field names to type specifications.
            path: Output file path.
            format: Output format or None for auto-detect from extension.
            chunk_size: Records per chunk (default: 1,000,000).
            table: Table name (required for SQL format, ignored otherwise).
            on_progress: Optional callback(records_written, total) after each chunk.

        Returns:
            The total number of records written.

        Raises:
            ValueError: If schema is invalid, format is unsupported, or SQL
                format is used without a table name.
            OSError: If the file cannot be created or written.
        """
        ...

    @staticmethod
    def estimate_memory(n: int, schema: Schema) -> int:
        """Estimate memory usage in bytes for generating n records.

        Args:
            n: Number of records.
            schema: Schema dictionary.

        Returns:
            Estimated memory in bytes.
        """
        ...

    # Async records generators
    def records_async(
        self, n: int, schema: Schema, chunk_size: int | None = None