2222from abc import ABC , abstractmethod
2323from copy import copy
2424from dataclasses import dataclass
25- from datetime import date , datetime
25+ from datetime import datetime
2626from enum import Enum
2727from functools import cached_property , singledispatch
2828from itertools import chain
6767 write_manifest ,
6868 write_manifest_list ,
6969)
70- from pyiceberg .partitioning import PartitionSpec
70+ from pyiceberg .partitioning import PartitionFieldValue , PartitionKey , PartitionSpec
7171from pyiceberg .schema import (
7272 PartnerAccessor ,
7373 Schema ,
107107 Identifier ,
108108 KeyDefaultDict ,
109109 Properties ,
110- Record ,
111110)
112111from pyiceberg .types import (
113112 IcebergType ,
118117 StructType ,
119118)
120119from pyiceberg .utils .concurrent import ExecutorFactory
121- from pyiceberg .utils .datetime import date_to_days , datetime_to_micros , datetime_to_millis
120+ from pyiceberg .utils .datetime import datetime_to_millis
122121
123122if TYPE_CHECKING :
124123 import pandas as pd
@@ -2257,7 +2256,7 @@ class WriteTask:
def generate_data_file_partition_path(self) -> str:
    """Return the partition-path segment for this task's data file.

    Delegates to ``PartitionKey.to_path()``; the key now carries its own
    spec and schema, so no schema argument is passed (the older signature
    ``to_path(schema)`` was dropped in this change).

    Raises:
        ValueError: If this WriteTask has no partition key, i.e. the write
            targets an unpartitioned table.
    """
    if self.partition_key is None:
        raise ValueError("Cannot generate partition path based on non-partitioned WriteTask")
    return self.partition_key.to_path()
22612260
22622261 def generate_data_file_filename (self , extension : str ) -> str :
22632262 # Mimics the behavior in the Java API:
@@ -2467,41 +2466,6 @@ class TablePartition:
24672466 arrow_table_partition : pa .Table
24682467
24692468
2470- @dataclass (frozen = True )
2471- class PartitionKey :
2472- raw_partition_key : Record # partition key in raw python type
2473- partition_spec : PartitionSpec
2474-
2475- # this only supports identity transform now
2476- @property
2477- def partition (self ) -> Record : # partition key in iceberg type
2478- iceberg_typed_key_values = {
2479- field_name : iceberg_typed_value (getattr (self .raw_partition_key , field_name , None ))
2480- for field_name in self .raw_partition_key ._position_to_field_name
2481- }
2482-
2483- return Record (** iceberg_typed_key_values )
2484-
2485- def to_path (self , schema : Schema ) -> str :
2486- return self .partition_spec .partition_to_path (self .partition , schema )
2487-
2488-
2489- @singledispatch
2490- def iceberg_typed_value (value : Any ) -> Any :
2491- return value
2492-
2493-
2494- @iceberg_typed_value .register (datetime )
2495- def _ (value : Any ) -> int :
2496- val = datetime_to_micros (value )
2497- return val
2498-
2499-
2500- @iceberg_typed_value .register (date )
2501- def _ (value : Any ) -> int :
2502- return date_to_days (value )
2503-
2504-
25052469def _get_partition_sort_order (partition_columns : list [str ], reverse : bool = False ) -> dict [str , Any ]:
25062470 order = 'ascending' if not reverse else 'descending'
25072471 null_placement = 'at_start' if reverse else 'at_end'
@@ -2538,15 +2502,35 @@ def _get_partition_columns(iceberg_table: Table, arrow_table: pa.Table) -> list[
25382502 return partition_cols
25392503
25402504
2541- def _get_partition_key (
2542- arrow_table : pa .Table , partition_columns : list [str ], offset : int , partition_spec : PartitionSpec
2543- ) -> PartitionKey :
2544- # todo: Instead of fetching partition keys one at a time, try filtering by a mask made of offsets, and convert to py together,
2545- # possibly slightly more efficient.
2546- return PartitionKey (
2547- raw_partition_key = Record (** {col : arrow_table .column (col )[offset ].as_py () for col in partition_columns }),
2548- partition_spec = partition_spec ,
2549- )
def _get_table_partitions(
    arrow_table: pa.Table,
    partition_spec: PartitionSpec,
    schema: Schema,
    slice_instructions: list[dict[str, Any]],
) -> list[TablePartition]:
    """Split an Arrow table into one TablePartition per slice instruction.

    Args:
        arrow_table: The full Arrow table, already sorted so that each slice
            instruction (``{"offset": ..., "length": ...}``) covers one
            partition group of contiguous rows.
        partition_spec: The table's partition spec; its fields determine which
            source columns contribute partition values.
        schema: The Iceberg schema, used to resolve partition-field source ids
            to column names.
        slice_instructions: Keyword dicts accepted by ``pa.Table.slice``.

    Returns:
        A list of TablePartition, one per slice, each carrying the
        PartitionKey built from that slice's first row.
    """
    sorted_slice_instructions = sorted(slice_instructions, key=lambda x: x['offset'])

    partition_fields = partition_spec.fields

    # Fetch each partition column's value at every slice's first row in one
    # vectorized `take`, instead of one .as_py() call per slice per column.
    offsets = [inst["offset"] for inst in sorted_slice_instructions]
    projected_and_filtered = {
        partition_field.source_id: arrow_table[schema.find_field(name_or_id=partition_field.source_id).name]
        .take(offsets)
        .to_pylist()
        for partition_field in partition_fields
    }

    table_partitions = []
    # BUGFIX: `.take(offsets)` yields lists indexed by POSITION (0..len-1),
    # not by absolute row offset. Indexing them with `inst["offset"]` raised
    # IndexError (or picked the wrong value) whenever offsets != [0, 1, 2, ...].
    # Position i in the taken lists corresponds to sorted_slice_instructions[i].
    for i, inst in enumerate(sorted_slice_instructions):
        partition_slice = arrow_table.slice(**inst)
        fieldvalues = [
            PartitionFieldValue(partition_field.source_id, projected_and_filtered[partition_field.source_id][i])
            for partition_field in partition_fields
        ]
        partition_key = PartitionKey(raw_partition_field_values=fieldvalues, partition_spec=partition_spec, schema=schema)
        table_partitions.append(TablePartition(partition_key=partition_key, arrow_table_partition=partition_slice))

    return table_partitions
25502534
25512535
25522536def _partition (iceberg_table : Table , arrow_table : pa .Table ) -> Iterable [TablePartition ]:
@@ -2584,7 +2568,7 @@ def _partition(iceberg_table: Table, arrow_table: pa.Table) -> Iterable[TablePar
25842568 reversing_sort_order_options = _get_partition_sort_order (partition_columns , reverse = True )
25852569 reversed_indices = pa .compute .sort_indices (arrow_table , ** reversing_sort_order_options ).to_pylist ()
25862570
2587- slice_instructions = []
2571+ slice_instructions : list [ dict [ str , Any ]] = []
25882572 last = len (reversed_indices )
25892573 reversed_indices_size = len (reversed_indices )
25902574 ptr = 0
@@ -2595,13 +2579,10 @@ def _partition(iceberg_table: Table, arrow_table: pa.Table) -> Iterable[TablePar
25952579 last = reversed_indices [ptr ]
25962580 ptr = ptr + group_size
25972581
2598- table_partitions : list [TablePartition ] = [
2599- TablePartition (
2600- partition_key = _get_partition_key (arrow_table , partition_columns , inst ["offset" ], iceberg_table .spec ()),
2601- arrow_table_partition = arrow_table .slice (** inst ),
2602- )
2603- for inst in slice_instructions
2604- ]
2582+ table_partitions : list [TablePartition ] = _get_table_partitions (
2583+ arrow_table , iceberg_table .spec (), iceberg_table .schema (), slice_instructions
2584+ )
2585+
26052586 return table_partitions
26062587
26072588
0 commit comments