Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pyiceberg/io/fileformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from pyiceberg.partitioning import PartitionField, PartitionSpec, partition_record_value
from pyiceberg.schema import Schema
from pyiceberg.typedef import Properties, Record
from pyiceberg.types import NestedField

if TYPE_CHECKING:
import pyarrow as pa
Expand Down Expand Up @@ -161,6 +162,10 @@ def create_writer(
properties: Properties,
) -> FileFormatWriter: ...

@abstractmethod
def add_field_metadata(self, field: NestedField, metadata: dict[bytes, bytes], include_field_ids: bool) -> None:
    """Add format-specific Arrow field metadata.

    Implementations write their entries directly into ``metadata``; nothing
    is returned.

    Args:
        field: The Iceberg field whose Arrow field metadata is being assembled.
        metadata: Mutable mapping of Arrow field metadata, updated in place.
        include_field_ids: Whether the field ID should be recorded in ``metadata``.
    """


class FileFormatFactory:
"""Registry of FileFormatModel implementations."""
Expand Down
133 changes: 103 additions & 30 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from enum import Enum
from functools import lru_cache, singledispatch
from typing import (
IO,
TYPE_CHECKING,
Any,
Generic,
Expand Down Expand Up @@ -122,6 +123,7 @@
OutputStream,
)
from pyiceberg.io.fileformat import DataFileStatistics as DataFileStatistics
from pyiceberg.io.fileformat import FileFormatFactory, FileFormatModel, FileFormatWriter
from pyiceberg.manifest import (
DataFile,
DataFileContent,
Expand Down Expand Up @@ -1895,6 +1897,7 @@ def _to_requested_schema(
include_field_ids: bool = False,
projected_missing_fields: dict[int, Any] = EMPTY_DICT,
allow_timestamp_tz_mismatch: bool = False,
format_model: FileFormatModel | None = None,
) -> pa.RecordBatch:
# We could reuse some of these visitors
struct_array = visit_with_partner(
Expand All @@ -1906,6 +1909,7 @@ def _to_requested_schema(
include_field_ids,
projected_missing_fields=projected_missing_fields,
allow_timestamp_tz_mismatch=allow_timestamp_tz_mismatch,
format_model=format_model,
),
ArrowAccessor(file_schema),
)
Expand All @@ -1918,6 +1922,7 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, pa.Array | None]
_downcast_ns_timestamp_to_us: bool
_projected_missing_fields: dict[int, Any]
_allow_timestamp_tz_mismatch: bool
_format_model: FileFormatModel | None

def __init__(
self,
Expand All @@ -1926,14 +1931,18 @@ def __init__(
include_field_ids: bool = False,
projected_missing_fields: dict[int, Any] = EMPTY_DICT,
allow_timestamp_tz_mismatch: bool = False,
format_model: FileFormatModel | None = None,
) -> None:
if include_field_ids and format_model is None:
raise ValueError("format_model is required when include_field_ids=True")
self._file_schema = file_schema
self._include_field_ids = include_field_ids
self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
self._projected_missing_fields = projected_missing_fields
# When True, allows projecting timestamptz (UTC) to timestamp (no tz).
# Allowed for reading (aligns with Spark); disallowed for writing to enforce Iceberg spec's strict typing.
self._allow_timestamp_tz_mismatch = allow_timestamp_tz_mismatch
self._format_model = format_model

def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
file_field = self._file_schema.find_field(field.field_id)
Expand Down Expand Up @@ -1988,13 +1997,11 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
return values

def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Field:
metadata = {}
metadata: dict[bytes, bytes] = {}
if field.doc:
metadata[PYARROW_FIELD_DOC_KEY] = field.doc
if self._include_field_ids:
# For projection visitor, we don't know the file format, so default to Parquet
# This is used for schema conversion during reads, not writes
metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
metadata[PYARROW_FIELD_DOC_KEY] = field.doc.encode()
if self._format_model is not None:
self._format_model.add_field_metadata(field, metadata, self._include_field_ids)

return pa.field(
name=field.name,
Expand Down Expand Up @@ -2614,21 +2621,93 @@ def data_file_statistics_from_parquet_metadata(
)


class ParquetFormatWriter(FileFormatWriter):
    """Writes Arrow tables to a single Parquet file.

    The output stream and the underlying ``pq.ParquetWriter`` are created
    lazily on the first ``write`` call, using the Arrow schema of that table.
    ``close`` finalizes the file and derives the data file statistics from the
    Parquet metadata; the statistics are cached so repeated ``close`` calls
    return the same object.
    """

    def __init__(self, output_file: OutputFile, file_schema: Schema, properties: Properties) -> None:
        """Capture the destination, schema and writer configuration.

        Args:
            output_file: Destination the Parquet file is created at on first write.
            file_schema: Iceberg schema used to plan and map column statistics.
            properties: Table properties controlling the Parquet writer settings.
        """
        # Imported locally, mirroring write_file in this module, so the name is
        # bound here without relying on a module-level import of pyiceberg.table.
        from pyiceberg.table import TableProperties

        self._output_file = output_file
        self._file_schema = file_schema
        self._properties = properties
        self._writer: pq.ParquetWriter | None = None
        self._fos: OutputStream | None = None
        # close() checks this attribute first, so it must exist on every instance.
        self._result: DataFileStatistics | None = None
        self._parquet_writer_kwargs = _get_parquet_writer_kwargs(properties)
        self._row_group_size = property_as_int(
            properties=properties,
            property_name=TableProperties.PARQUET_ROW_GROUP_LIMIT,
            default=TableProperties.PARQUET_ROW_GROUP_LIMIT_DEFAULT,
        )

    def write(self, table: pa.Table) -> None:
        """Append ``table`` to the file, opening the writer on first use.

        Args:
            table: Arrow table to write; the first table's schema becomes the file schema.
        """
        if self._writer is None:
            fos = self._output_file.create(overwrite=True)
            try:
                self._writer = pq.ParquetWriter(
                    cast(IO[Any], fos),
                    schema=table.schema,
                    store_decimal_as_integer=True,
                    **self._parquet_writer_kwargs,
                )
            except Exception:
                # Do not leak the freshly opened stream if the writer cannot be built.
                fos.close()
                raise
            # Only remember the stream once the writer that owns it exists.
            self._fos = fos
        self._writer.write(table, row_group_size=self._row_group_size)

    def close(self) -> DataFileStatistics:
        """Finalize the Parquet file and return its statistics.

        Returns:
            Statistics computed from the written file's Parquet metadata. A
            second call returns the cached result without touching the file.

        Raises:
            ValueError: If ``write`` was never called, so no file was opened.
        """
        if self._result is not None:
            return self._result
        if self._writer is None or self._fos is None:
            raise ValueError("Cannot close a writer that was never written to")
        # The stream is closed even if finalizing the Parquet footer fails.
        with self._fos:
            self._writer.close()
        self._result = data_file_statistics_from_parquet_metadata(
            parquet_metadata=self._writer.writer.metadata,
            stats_columns=compute_statistics_plan(self._file_schema, self._properties),
            parquet_column_mapping=parquet_path_to_id_mapping(self._file_schema),
        )
        return self._result


class ParquetFormatModel(FileFormatModel):
    """FileFormatModel implementation for Apache Parquet."""

    @property
    def format(self) -> FileFormat:
        """Return the file format handled by this model."""
        return FileFormat.PARQUET

    def file_extension(self) -> str:
        """Return the extension (without a leading dot) for files of this format."""
        return "parquet"

    def create_writer(
        self,
        output_file: OutputFile,
        file_schema: Schema,
        properties: Properties,
    ) -> ParquetFormatWriter:
        """Build a Parquet writer targeting ``output_file``."""
        return ParquetFormatWriter(
            output_file=output_file,
            file_schema=file_schema,
            properties=properties,
        )

    def add_field_metadata(self, field: NestedField, metadata: dict[bytes, bytes], include_field_ids: bool) -> None:
        """Record the Parquet field ID in ``metadata`` when field IDs are requested."""
        if not include_field_ids:
            return
        encoded_id = str(field.field_id).encode()
        metadata[PYARROW_PARQUET_FIELD_ID_KEY] = encoded_id


# Register the Parquet model when this module is imported so that
# FileFormatFactory.get(...) (as called by write_file) can resolve it.
FileFormatFactory.register(ParquetFormatModel())


def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, TableProperties

parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties)
row_group_size = property_as_int(
properties=table_metadata.properties,
property_name=TableProperties.PARQUET_ROW_GROUP_LIMIT,
default=TableProperties.PARQUET_ROW_GROUP_LIMIT_DEFAULT,
file_format = FileFormat(
table_metadata.properties.get(
TableProperties.WRITE_FILE_FORMAT,
TableProperties.WRITE_FILE_FORMAT_DEFAULT,
)
)
format_model = FileFormatFactory.get(file_format)
location_provider = load_location_provider(table_location=table_metadata.location, table_properties=table_metadata.properties)

def write_parquet(task: WriteTask) -> DataFile:
def write_data_file(task: WriteTask) -> DataFile:
table_schema = table_metadata.schema()
# if schema needs to be transformed, use the transformed schema and adjust the arrow table accordingly
# otherwise use the original schema
if (sanitized_schema := sanitize_column_names(table_schema)) != table_schema:
file_schema = sanitized_schema
else:
Expand All @@ -2642,29 +2721,25 @@ def write_parquet(task: WriteTask) -> DataFile:
batch=batch,
downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
include_field_ids=True,
format_model=format_model,
)
for batch in task.record_batches
]
arrow_table = pa.Table.from_batches(batches)
file_path = location_provider.new_data_location(
data_file_name=task.generate_data_file_filename("parquet"),
data_file_name=task.generate_data_file_filename(format_model.file_extension()),
partition_key=task.partition_key,
)
fo = io.new_output(file_path)
with fo.create(overwrite=True) as fos:
with pq.ParquetWriter(
fos, schema=arrow_table.schema, store_decimal_as_integer=True, **parquet_writer_kwargs
) as writer:
writer.write(arrow_table, row_group_size=row_group_size)
statistics = data_file_statistics_from_parquet_metadata(
parquet_metadata=writer.writer.metadata,
stats_columns=compute_statistics_plan(file_schema, table_metadata.properties),
parquet_column_mapping=parquet_path_to_id_mapping(file_schema),
)
data_file = DataFile.from_args(
writer = format_model.create_writer(fo, file_schema, table_metadata.properties)
with writer:
writer.write(arrow_table)
statistics = writer.result()

return DataFile.from_args(
content=DataFileContent.DATA,
file_path=file_path,
file_format=FileFormat.PARQUET,
file_format=file_format,
partition=task.partition_key.partition if task.partition_key else Record(),
file_size_in_bytes=len(fo),
# After this has been fixed:
Expand All @@ -2678,10 +2753,8 @@ def write_parquet(task: WriteTask) -> DataFile:
**statistics.to_serialized_dict(),
)

return data_file

executor = ExecutorFactory.get_or_create()
data_files = executor.map(write_parquet, tasks)
data_files = executor.map(write_data_file, tasks)

return iter(data_files)

Expand Down
14 changes: 14 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,20 @@ def table_schema_simple() -> Schema:
)


@pytest.fixture(scope="session")
def arrow_table_simple() -> "pa.Table":
    """Return a three-row, null-free pyarrow table that pairs with `table_schema_simple`."""
    import pyarrow as pa

    # Column order is foo, bar, baz; "bar" is pinned to int32 explicitly.
    columns = {
        "foo": ["a", "b", "c"],
        "bar": pa.array([1, 2, 3], type=pa.int32()),
        "baz": [True, False, True],
    }
    return pa.table(columns)


@pytest.fixture(scope="session")
def table_schema_with_full_nested_fields() -> Schema:
return Schema(
Expand Down
3 changes: 3 additions & 0 deletions tests/io/test_fileformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ def file_extension(self) -> str:
def create_writer(self, output_file: Any, file_schema: Any, properties: Any) -> Any:
raise NotImplementedError

def add_field_metadata(self, field: Any, metadata: Any, include_field_ids: bool) -> None:
pass

original = dict(FileFormatFactory._registry)
try:
model = _DummyModel()
Expand Down
Loading