1 change: 1 addition & 0 deletions Cargo.lock


1 change: 1 addition & 0 deletions Cargo.toml
@@ -47,6 +47,7 @@ extension-module = ["pyo3/extension-module"]
bytes = "1"
criterion = "0.8"
proptest = "1.9"
tempfile = "3"

[[bench]]
name = "generators"
67 changes: 67 additions & 0 deletions README.md
@@ -673,6 +673,73 @@ sql = records_sql(1000, {
For large batches, multiple INSERT statements are generated with up to 1000 rows
each. Column names are double-quoted and string values use single-quote escaping.
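
As a sketch of the output shape, assuming the documented quoting rules (the
literal values depend on the seed, and exact whitespace may differ):

```python
from forgery import records_sql, seed

seed(42)
sql = records_sql(2, {"name": "name", "email": "email"}, "users")
print(sql)
# Representative shape (values vary by seed):
# INSERT INTO users ("name", "email") VALUES
#   ('Alice O''Brien', 'alice@example.com'),
#   ('Bob Smith', 'bob@example.com');
```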

### Streaming File Writer

For datasets that exceed available memory, `records_to_file()` generates records
in bounded-memory chunks and writes each chunk to disk before generating the next.
Memory usage is proportional to `chunk_size`, not to the total record count `n`.

```python
from forgery import Faker

fake = Faker()
fake.seed(42)

# Generate 100 million records — memory stays at ~500-800 MB
fake.records_to_file(
    100_000_000,
    {"id": "uuid", "name": "name", "amount": ("float", 0.01, 9999.99)},
    "transactions.parquet",
    chunk_size=1_000_000,  # records per chunk (default: 1M, max: 10M)
)
```

**Supported formats:** CSV (`.csv`), NDJSON (`.ndjson`/`.jsonl`), SQL (`.sql`),
Parquet (`.parquet`). Format is auto-detected from the file extension, or set
explicitly with `format="csv"`.
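
When the path has no recognizable extension, pass `format` explicitly. A
minimal sketch (the `.dat` output path is illustrative):

```python
from forgery import records_to_file, seed

seed(42)
# "export.dat" matches no known extension, so the format must be explicit.
records_to_file(
    1_000_000,
    {"id": "uuid", "email": "email"},
    "export.dat",
    format="csv",
)
```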

SQL format requires a `table` parameter:

```python
from forgery import records_to_file, seed

seed(42)
records_to_file(
    50_000_000,
    {"name": "name", "email": "email"},
    "users.sql",
    table="users",
    chunk_size=500_000,
)
```

**Progress callback** — monitor long-running writes with an optional callback:

```python
from forgery import records_to_file, seed

seed(42)
records_to_file(
    10_000_000,
    {"name": "name", "email": "email"},
    "users.csv",
    on_progress=lambda written, total: print(f"\r{written/total:.0%}", end=""),
)
```
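
The callback receives cumulative counts, while progress bars such as `tqdm`
expect increments, so feed it the delta. A sketch assuming the third-party
`tqdm` package is installed (it is not a dependency of this library):

```python
from tqdm import tqdm

from forgery import records_to_file, seed

seed(42)
with tqdm(total=10_000_000, unit="rec") as bar:
    records_to_file(
        10_000_000,
        {"name": "name", "email": "email"},
        "users.csv",
        # bar.n is tqdm's running total; update() takes the increment.
        on_progress=lambda written, total: bar.update(written - bar.n),
    )
```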

**Memory estimation** — plan chunk sizes based on available RAM:

```python
from forgery import Faker

schema = {"id": "uuid", "name": "name", "amount": ("float", 0.01, 9999.99)}
est = Faker.estimate_memory(1_000_000, schema)
print(f"~{est / 1024**2:.0f} MB per 1M records")
```
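
Because the estimate is built from average per-record sizes, it can be
inverted to pick a `chunk_size` for a given RAM budget. A sketch (the 512 MB
budget is an arbitrary example):

```python
from forgery import Faker

schema = {"id": "uuid", "name": "name", "amount": ("float", 0.01, 9999.99)}

# Estimate bytes per record from a small sample count.
per_record = Faker.estimate_memory(1_000, schema) / 1_000

budget = 512 * 1024**2  # target ~512 MB per chunk
chunk_size = min(int(budget / per_record), 10_000_000)  # respect the 10M cap
print(f"chunk_size ~ {chunk_size:,}")
```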

All streaming formats use row-major generation, so the same seed produces
identical data across CSV, NDJSON, SQL, and Parquet output.
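
A minimal sketch of that guarantee: re-seeding before each call yields the
same logical rows in both files (file names are illustrative):

```python
from forgery import records_to_file, seed

schema = {"id": "uuid", "name": "name"}

seed(42)
records_to_file(1_000_000, schema, "data.csv")

seed(42)  # same seed, different format: same rows, different encoding
records_to_file(1_000_000, schema, "data.ndjson")
```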

### Schema Field Types

| Type | Syntax | Example |
78 changes: 77 additions & 1 deletion python/forgery/__init__.py
@@ -57,7 +57,7 @@
>>> german_fake.names(10) # German names
"""

from collections.abc import Coroutine
from collections.abc import Callable, Coroutine
from typing import TYPE_CHECKING, Any

from forgery._forgery import Faker
@@ -129,6 +129,7 @@
    "ean13s",
    "email",
    "emails",
    "estimate_memory",
    "fake",
    "file_extension",
    "file_extensions",
@@ -225,6 +226,7 @@
    "records_ndjson",
    "records_parquet",
    "records_sql",
    "records_to_file",
    "records_tuples",
    "records_tuples_async",
    "remove_provider",
@@ -1491,6 +1493,80 @@ def records_sql(n: int, schema: Schema, table: str) -> str:
    return fake.records_sql(n, schema, table)


# === Streaming File Writer ===


def records_to_file(
    n: int,
    schema: Schema,
    path: str,
    *,
    format: str | None = None,
    chunk_size: int | None = None,
    table: str | None = None,
    on_progress: Callable[[int, int], None] | None = None,
) -> int:
    """Generate records and stream them to a file in chunks.

    Memory stays bounded by chunk_size regardless of total n, enabling
    generation of datasets far larger than available RAM.

    Supported formats: csv, ndjson, sql, parquet (auto-detected from
    file extension, or specified explicitly).

    Args:
        n: Total number of records to generate.
        schema: Dictionary mapping field names to type specifications.
        path: Output file path.
        format: Output format ("csv", "ndjson", "sql", "parquet") or
            None to auto-detect from the file extension.
        chunk_size: Records per chunk (default: 1,000,000). Max: 10,000,000.
        table: Table name (required for SQL format, ignored otherwise).
        on_progress: Optional callback(records_written, total) called
            after each chunk is written.

    Returns:
        The total number of records written.

    Example:
        >>> from forgery import records_to_file, seed
        >>> seed(42)
        >>> records_to_file(100, {"name": "name", "email": "email"}, "/tmp/test.csv")
        100
    """
    return fake.records_to_file(
        n,
        schema,
        path,
        format=format,
        chunk_size=chunk_size,
        table=table,
        on_progress=on_progress,
    )


def estimate_memory(n: int, schema: Schema) -> int:
    """Estimate memory usage in bytes for generating n records.

    Provides a rough estimate based on average field sizes. Useful for
    deciding chunk_size for records_to_file().

    Args:
        n: Number of records.
        schema: Dictionary mapping field names to type specifications.

    Returns:
        Estimated memory in bytes.

    Example:
        >>> from forgery import estimate_memory
        >>> est = estimate_memory(1_000_000, {"name": "name", "email": "email"})
        >>> est > 0
        True
    """
    return Faker.estimate_memory(n, schema)


# === Async Records Generation ===


48 changes: 47 additions & 1 deletion python/forgery/__init__.pyi
@@ -1,6 +1,6 @@
"""Type stubs for the forgery package."""

from collections.abc import Coroutine
from collections.abc import Callable, Coroutine
from typing import Any

from forgery._forgery import CreditCardFull as CreditCardFull
@@ -752,6 +752,52 @@ def records_sql(n: int, schema: Schema, table: str) -> str:
    """
    ...

# Streaming file writer

def records_to_file(
    n: int,
    schema: Schema,
    path: str,
    *,
    format: str | None = None,
    chunk_size: int | None = None,
    table: str | None = None,
    on_progress: Callable[[int, int], None] | None = None,
) -> int:
    """Generate records and stream them to a file in chunks.

    Memory stays bounded by chunk_size regardless of total n.

    Args:
        n: Total number of records to generate.
        schema: Dictionary mapping field names to type specifications.
        path: Output file path.
        format: Output format ("csv", "ndjson", "sql", "parquet") or None.
        chunk_size: Records per chunk (default: 1,000,000).
        table: Table name (required for SQL format).
        on_progress: Optional callback(records_written, total).

    Returns:
        The total number of records written.

    Raises:
        ValueError: If schema is invalid or format is unsupported.
        OSError: If file cannot be created.
    """
    ...

def estimate_memory(n: int, schema: Schema) -> int:
    """Estimate memory usage in bytes for generating n records.

    Args:
        n: Number of records.
        schema: Dictionary mapping field names to type specifications.

    Returns:
        Estimated memory in bytes.
    """
    ...

# Async Records generation

def records_async(
71 changes: 70 additions & 1 deletion python/forgery/_forgery.pyi
@@ -1,7 +1,7 @@
"""Type stubs for the Rust extension module."""

import builtins
from collections.abc import Coroutine
from collections.abc import Callable, Coroutine
from typing import Any, TypedDict

class CreditCardFull(TypedDict):
@@ -1095,6 +1095,75 @@ class Faker:
        """
        ...

    # Serialized output formats
    def records_csv(self, n: int, schema: Schema) -> str:
        """Generate records as a CSV string with header row."""
        ...

    def records_json(self, n: int, schema: Schema) -> str:
        """Generate records as a JSON array string."""
        ...

    def records_ndjson(self, n: int, schema: Schema) -> str:
        """Generate records as newline-delimited JSON."""
        ...

    def records_parquet(self, n: int, schema: Schema) -> bytes:
        """Generate records as Parquet file bytes."""
        ...

    def records_sql(self, n: int, schema: Schema, table: str) -> str:
        """Generate records as SQL INSERT statements."""
        ...

    # Streaming file writer
    def records_to_file(
        self,
        n: int,
        schema: Schema,
        path: str,
        format: str | None = None,
        chunk_size: int | None = None,
        table: str | None = None,
        on_progress: Callable[[int, int], None] | None = None,
    ) -> int:
        """Generate records and stream them to a file in chunks.

        Memory stays bounded by chunk_size regardless of total n.
        Supports: csv, ndjson, sql, parquet (auto-detected from file extension).

        Args:
            n: Total number of records to generate.
            schema: Dictionary mapping field names to type specifications.
            path: Output file path.
            format: Output format or None for auto-detect from extension.
            chunk_size: Records per chunk (default: 1,000,000).
            table: Table name (required for SQL format, ignored otherwise).
            on_progress: Optional callback(records_written, total) after each chunk.

        Returns:
            The total number of records written.

        Raises:
            ValueError: If schema is invalid, format is unsupported, or SQL
                format is used without a table name.
            OSError: If the file cannot be created or written.
        """
        ...

    @staticmethod
    def estimate_memory(n: int, schema: Schema) -> int:
        """Estimate memory usage in bytes for generating n records.

        Args:
            n: Number of records.
            schema: Schema dictionary.

        Returns:
            Estimated memory in bytes.
        """
        ...

    # Async records generators
    def records_async(
        self, n: int, schema: Schema, chunk_size: int | None = None