From 8f5e2fa698233a876cb3795ab3a6b820e4e2e909 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 10 Nov 2025 05:14:55 +0000 Subject: [PATCH 01/11] refactor: major cleanup and simplification of protocols The protocols, as well as the implementation hierarchy, for all components have been greatly simplified. This commit leaves the package only partially functional, with rewrites still pending for everything related to the pipeline and data caching. --- src/orcapod/__init__.py | 52 +- src/orcapod/contexts/__init__.py | 9 +- src/orcapod/contexts/core.py | 9 +- src/orcapod/contexts/registry.py | 8 +- src/orcapod/core/__init__.py | 2 +- src/orcapod/core/base.py | 105 +- src/orcapod/core/datagrams/__init__.py | 4 +- src/orcapod/core/datagrams/arrow_datagram.py | 81 +- .../core/datagrams/arrow_tag_packet.py | 200 +- src/orcapod/core/datagrams/base.py | 71 +- src/orcapod/core/datagrams/dict_datagram.py | 92 +- src/orcapod/core/datagrams/dict_tag_packet.py | 215 +- src/orcapod/core/executable_pod.py | 306 +++ src/orcapod/core/execution_engine.py | 22 + src/orcapod/core/function_pod.py | 706 +++++ src/orcapod/core/kernels.py | 241 -- src/orcapod/core/operators/__init__.py | 12 +- src/orcapod/core/operators/base.py | 297 +-- src/orcapod/core/operators/batch.py | 55 +- .../core/operators/column_selection.py | 125 +- src/orcapod/core/operators/filters.py | 70 +- src/orcapod/core/operators/join.py | 65 +- src/orcapod/core/operators/mappers.py | 64 +- src/orcapod/core/operators/semijoin.py | 84 +- src/orcapod/core/packet_function.py | 307 +++ src/orcapod/core/pods.py | 905 ------- src/orcapod/core/polars_data_utils.py | 5 +- src/orcapod/core/schema.py | 0 .../core/sources/arrow_table_source.py | 2 +- src/orcapod/core/sources/base.py | 35 +- src/orcapod/core/sources/data_frame_source.py | 2 +- src/orcapod/core/sources/dict_source.py | 2 +- src/orcapod/core/sources/list_source.py | 4 +- src/orcapod/core/streams/__init__.py | 13 +- src/orcapod/core/streams/base.py | 476 +--- src/orcapod/core/streams/cached_pod_stream.py | 461 ---- src/orcapod/core/streams/kernel_stream.py | 199 -- src/orcapod/core/streams/lazy_pod_stream.py | 232 -- src/orcapod/core/streams/pod_node_stream.py | 27 +- src/orcapod/core/streams/table_stream.py | 132 +- src/orcapod/core/streams/wrapped_stream.py | 86 - src/orcapod/core/{trackers.py => tracker.py} | 53 +- src/orcapod/hashing/arrow_hashers.py | 11 +- src/orcapod/pipeline/graph.py | 6 +- src/orcapod/pipeline/nodes.py | 13 +- .../protocols/core_protocols/__init__.py | 24 +- src/orcapod/protocols/core_protocols/base.py | 110 - .../protocols/core_protocols/datagrams.py | 602 ++--- .../protocols/core_protocols/function_pod.py | 33 + .../protocols/core_protocols/kernel.py | 201 -- .../protocols/core_protocols/labelable.py | 47 + .../protocols/core_protocols/operator_pod.py | 12 + .../core_protocols/packet_function.py | 140 + src/orcapod/protocols/core_protocols/pod.py | 147 ++ src/orcapod/protocols/core_protocols/pods.py | 228 -- .../{source.py => source_pod.py} | 4 +- .../protocols/core_protocols/streams.py | 376 +-- .../protocols/core_protocols/temporal.py | 24 + .../protocols/core_protocols/trackers.py | 77 +- src/orcapod/protocols/hashing_protocols.py | 17 +- .../protocols/legacy_data_protocols.py | 2278 ----------------- .../utils/{types_utils.py => schema_utils.py} | 15 +- .../test_datagrams/test_arrow_datagram.py | 2 +- .../test_datagrams/test_arrow_tag_packet.py | 2 +- .../test_datagrams/test_base_integration.py | 2 +- .../test_datagrams/test_dict_datagram.py | 2 +-
.../test_datagrams/test_dict_tag_packet.py | 2 +- 67 files changed, 3060 insertions(+), 7151 deletions(-) create mode 100644 src/orcapod/core/executable_pod.py create mode 100644 src/orcapod/core/execution_engine.py create mode 100644 src/orcapod/core/function_pod.py delete mode 100644 src/orcapod/core/kernels.py create mode 100644 src/orcapod/core/packet_function.py delete mode 100644 src/orcapod/core/pods.py create mode 100644 src/orcapod/core/schema.py delete mode 100644 src/orcapod/core/streams/cached_pod_stream.py delete mode 100644 src/orcapod/core/streams/kernel_stream.py delete mode 100644 src/orcapod/core/streams/lazy_pod_stream.py delete mode 100644 src/orcapod/core/streams/wrapped_stream.py rename src/orcapod/core/{trackers.py => tracker.py} (89%) delete mode 100644 src/orcapod/protocols/core_protocols/base.py create mode 100644 src/orcapod/protocols/core_protocols/function_pod.py delete mode 100644 src/orcapod/protocols/core_protocols/kernel.py create mode 100644 src/orcapod/protocols/core_protocols/labelable.py create mode 100644 src/orcapod/protocols/core_protocols/operator_pod.py create mode 100644 src/orcapod/protocols/core_protocols/packet_function.py create mode 100644 src/orcapod/protocols/core_protocols/pod.py delete mode 100644 src/orcapod/protocols/core_protocols/pods.py rename src/orcapod/protocols/core_protocols/{source.py => source_pod.py} (91%) create mode 100644 src/orcapod/protocols/core_protocols/temporal.py delete mode 100644 src/orcapod/protocols/legacy_data_protocols.py rename src/orcapod/utils/{types_utils.py => schema_utils.py} (97%) diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index 226850e..0b8754d 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,29 +1,29 @@ -from .config import DEFAULT_CONFIG, Config -from .core import DEFAULT_TRACKER_MANAGER -from .core.pods import function_pod, FunctionPod, CachedPod -from .core import streams -from .core import operators -from .core import sources -from .core.sources import DataFrameSource -from . import databases -from .pipeline import Pipeline +# from .config import DEFAULT_CONFIG, Config +# from .core import DEFAULT_TRACKER_MANAGER +# from .core.packet_function import PythonPacketFunction +# from .core.function_pod import FunctionPod +# from .core import streams +# from .core import operators +# from .core import sources +# from .core.sources import DataFrameSource +# from . 
import databases +# from .pipeline import Pipeline +# no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking -no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking - -__all__ = [ - "DEFAULT_CONFIG", - "Config", - "DEFAULT_TRACKER_MANAGER", - "no_tracking", - "function_pod", - "FunctionPod", - "CachedPod", - "streams", - "databases", - "sources", - "DataFrameSource", - "operators", - "Pipeline", -] +# __all__ = [ +# "DEFAULT_CONFIG", +# "Config", +# "DEFAULT_TRACKER_MANAGER", +# "no_tracking", +# "function_pod", +# "FunctionPod", +# "CachedPod", +# "streams", +# "databases", +# "sources", +# "DataFrameSource", +# "operators", +# "Pipeline", +# ] diff --git a/src/orcapod/contexts/__init__.py b/src/orcapod/contexts/__init__.py index 116dbbb..48955f5 100644 --- a/src/orcapod/contexts/__init__.py +++ b/src/orcapod/contexts/__init__.py @@ -25,10 +25,13 @@ versions = get_available_contexts() """ -from .core import DataContext, ContextValidationError, ContextResolutionError -from .registry import JSONDataContextRegistry from typing import Any -from orcapod.protocols import hashing_protocols as hp, semantic_types_protocols as sp + +from orcapod.protocols import hashing_protocols as hp +from orcapod.protocols import semantic_types_protocols as sp + +from .core import ContextResolutionError, ContextValidationError, DataContext +from .registry import JSONDataContextRegistry # Global registry instance (lazily initialized) _registry: JSONDataContextRegistry | None = None diff --git a/src/orcapod/contexts/core.py b/src/orcapod/contexts/core.py index f1b35d3..f0cf76d 100644 --- a/src/orcapod/contexts/core.py +++ b/src/orcapod/contexts/core.py @@ -7,7 +7,8 @@ from dataclasses import dataclass -from orcapod.protocols import hashing_protocols as hp, semantic_types_protocols as sp +from orcapod.protocols.hashing_protocols import ArrowHasher, ObjectHasher +from orcapod.protocols.semantic_types_protocols import TypeConverter @dataclass @@ -31,9 +32,9 @@ class DataContext: context_key: str version: str description: str - type_converter: sp.TypeConverter - arrow_hasher: hp.ArrowHasher - object_hasher: hp.ObjectHasher + type_converter: TypeConverter + arrow_hasher: ArrowHasher + object_hasher: ObjectHasher # this is the currently the JSON hasher class ContextValidationError(Exception): diff --git a/src/orcapod/contexts/registry.py b/src/orcapod/contexts/registry.py index e3f0989..4747422 100644 --- a/src/orcapod/contexts/registry.py +++ b/src/orcapod/contexts/registry.py @@ -6,13 +6,13 @@ """ import json - - +import logging from pathlib import Path from typing import Any -import logging + from orcapod.utils.object_spec import parse_objectspec -from .core import DataContext, ContextValidationError, ContextResolutionError + +from .core import ContextResolutionError, ContextValidationError, DataContext logger = logging.getLogger(__name__) diff --git a/src/orcapod/core/__init__.py b/src/orcapod/core/__init__.py index 24f5aab..1a84d7f 100644 --- a/src/orcapod/core/__init__.py +++ b/src/orcapod/core/__init__.py @@ -1,4 +1,4 @@ -from .trackers import DEFAULT_TRACKER_MANAGER +from .tracker import DEFAULT_TRACKER_MANAGER from .system_constants import constants __all__ = [ diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 828c371..cb8d8f5 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -1,38 +1,42 @@ import logging -from abc import ABC +from abc import ABC, abstractmethod +from datetime import datetime, timezone from typing import Any -from orcapod import DEFAULT_CONFIG, contexts -from 
orcapod.config import Config +import orcapod.contexts as contexts +from orcapod.config import DEFAULT_CONFIG, Config from orcapod.protocols import hashing_protocols as hp logger = logging.getLogger(__name__) -class LablableBase: +# Base classes for Orcapod core components, providing common functionality. + + +class LabelableMixin: def __init__(self, label: str | None = None, **kwargs): self._label = label super().__init__(**kwargs) @property - def has_assigned_label(self) -> bool: + def label(self) -> str: """ - Check if the label is explicitly set for this object. + Get the label of this object. Returns: - bool: True if the label is explicitly set, False otherwise. + str | None: The label of the object, or None if not set. """ - return self._label is not None + return self._label or self.computed_label() or self.__class__.__name__ @property - def label(self) -> str: + def has_assigned_label(self) -> bool: """ - Get the label of this object. + Check if the label is explicitly set for this object. Returns: - str | None: The label of the object, or None if not set. + bool: True if the label is explicitly set, False otherwise. """ - return self._label or self.computed_label() or self.__class__.__name__ + return self._label is not None @label.setter def label(self, label: str | None) -> None: @@ -52,7 +56,7 @@ def computed_label(self) -> str | None: return None -class ContextAwareConfigurableBase(ABC): +class DataContextMixin: def __init__( self, data_context: str | contexts.DataContext | None = None, @@ -73,13 +77,17 @@ def orcapod_config(self) -> Config: def data_context(self) -> contexts.DataContext: return self._data_context + @data_context.setter + def data_context(self, context: str | contexts.DataContext | None) -> None: + self._data_context = contexts.resolve_context(context) + @property def data_context_key(self) -> str: """Return the data context key.""" return self._data_context.context_key -class ContentIdentifiableBase(ContextAwareConfigurableBase): +class ContentIdentifiableBase(DataContextMixin, ABC): """ Base class for content-identifiable objects. This class provides a way to define objects that can be uniquely identified @@ -101,6 +109,7 @@ def __init__(self, **kwargs) -> None: self._cached_content_hash: hp.ContentHash | None = None self._cached_int_hash: int | None = None + @abstractmethod def identity_structure(self) -> Any: """ Return a structure that represents the identity of this object. @@ -112,7 +121,7 @@ def identity_structure(self) -> Any: Returns: Any: A structure representing this object's content, or None to use default hash """ - raise NotImplementedError("Subclasses must implement identity_structure") + ... def content_hash(self) -> hp.ContentHash: """ @@ -157,5 +166,67 @@ def __eq__(self, other: object) -> bool: return self.identity_structure() == other.identity_structure() -class LabeledContentIdentifiableBase(ContentIdentifiableBase, LablableBase): - pass +class TemporalMixin: + """ + Mixin class that adds temporal functionality to an Orcapod entity. + It provides methods to track and manage the last modified timestamp of the entity. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._modified_time = self._update_modified_time() + + @property + def last_modified(self) -> datetime | None: + """ + When this object's content was last modified. 
+ + Returns: + datetime: Content last modified timestamp (timezone-aware) + None: Modification time unknown (assume always changed) + """ + return self._modified_time + + def _set_modified_time(self, modified_time: datetime | None) -> None: + """ + Set the modified time for this object. + + Args: + modified_time (datetime | None): The modified time to set. If None, clears the modified time. + """ + self._modified_time = modified_time + + def _update_modified_time(self) -> None: + """ + Update the modified time to the current time. + """ + self._modified_time = datetime.now(timezone.utc) + + def updated_since(self, timestamp: datetime) -> bool: + """ + Check if the object has been updated since the given timestamp. + + Args: + timestamp (datetime): The timestamp to compare against. + + Returns: + bool: True if the object has been updated since the given timestamp, False otherwise. + """ + # if _modified_time is None, consider it always updated + if self._modified_time is None: + return True + return self._modified_time > timestamp + + +class OrcapodBase(TemporalMixin, LabelableMixin, ContentIdentifiableBase): + """ + Base class for all default OrcaPod entities, providing common functionality + including data context awareness, content-based identity, (semantic) labeling, + and modification timestamp. + """ + + def __repr__(self): + return self.__class__.__name__ + + def __str__(self): + return self.label diff --git a/src/orcapod/core/datagrams/__init__.py b/src/orcapod/core/datagrams/__init__.py index 0c255e3..b20e776 100644 --- a/src/orcapod/core/datagrams/__init__.py +++ b/src/orcapod/core/datagrams/__init__.py @@ -1,7 +1,7 @@ from .arrow_datagram import ArrowDatagram -from .arrow_tag_packet import ArrowTag, ArrowPacket +from .arrow_tag_packet import ArrowPacket, ArrowTag from .dict_datagram import DictDatagram -from .dict_tag_packet import DictTag, DictPacket +from .dict_tag_packet import DictPacket, DictTag __all__ = [ "ArrowDatagram", diff --git a/src/orcapod/core/datagrams/arrow_datagram.py b/src/orcapod/core/datagrams/arrow_datagram.py index 9e5a7a5..b9fb7e8 100644 --- a/src/orcapod/core/datagrams/arrow_datagram.py +++ b/src/orcapod/core/datagrams/arrow_datagram.py @@ -1,13 +1,13 @@ import logging from collections.abc import Collection, Iterator, Mapping -from typing import Self, TYPE_CHECKING - +from typing import TYPE_CHECKING, Any, Self from orcapod import contexts from orcapod.core.datagrams.base import BaseDatagram from orcapod.core.system_constants import constants -from orcapod.types import DataValue, PythonSchema +from orcapod.protocols.core_protocols import ColumnConfig from orcapod.protocols.hashing_protocols import ContentHash +from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -57,6 +57,7 @@ def __init__( table: "pa.Table", meta_info: Mapping[str, DataValue] | None = None, data_context: str | contexts.DataContext | None = None, + **kwargs, ) -> None: """ Initialize ArrowDatagram from PyArrow Table. @@ -75,6 +76,8 @@ def __init__( The input table is automatically split into data, meta, and context components based on column naming conventions. 
""" + super().__init__() + # Validate table has exactly one row for datagram if len(table) != 1: raise ValueError( @@ -97,7 +100,7 @@ def __init__( data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] # Initialize base class with data context - super().__init__(data_context) + super().__init__(data_context=data_context, **kwargs) meta_columns = [ col for col in table.column_names if col.startswith(constants.META_PREFIX) @@ -185,14 +188,15 @@ def get(self, key: str, default: DataValue = None) -> DataValue: # 3. Structural Information def keys( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[str, ...]: """Return tuple of column names.""" # Start with data columns - include_meta_columns = include_all_info or include_meta_columns - include_context = include_all_info or include_context + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context result_keys = list(self._data_table.column_names) @@ -215,11 +219,11 @@ def keys( return tuple(result_keys) - def types( + def schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> PythonSchema: """ Return Python schema for the datagram. @@ -234,8 +238,9 @@ def types( Returns: Python schema """ - include_meta_columns = include_all_info or include_meta_columns - include_context = include_all_info or include_context + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context # Get data schema (cached) if self._cached_python_schema is None: @@ -274,9 +279,9 @@ def types( def arrow_schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. @@ -292,8 +297,9 @@ def arrow_schema( PyArrow schema representing the datagram's structure """ # order matters - include_meta_columns = include_all_info or include_meta_columns - include_context = include_all_info or include_context + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context all_schemas = [self._data_table.schema] @@ -344,9 +350,9 @@ def content_hash(self) -> ContentHash: # 4. Format Conversions (Export) def as_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """ Return dictionary representation of the datagram. 
@@ -361,8 +367,9 @@ def as_dict( Returns: Dictionary representation """ - include_meta_columns = include_all_info or include_meta_columns - include_context = include_all_info or include_context + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context # Get data dict (cached) if self._cached_python_dict is None: @@ -380,6 +387,7 @@ def as_dict( # Add meta data if requested if include_meta_columns and self._meta_table is not None: + meta_dict = None if include_meta_columns is True: meta_dict = self._meta_table.to_pylist()[0] elif isinstance(include_meta_columns, Collection): @@ -397,9 +405,9 @@ def as_dict( def as_table( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": """ Convert the datagram to an Arrow table. @@ -414,8 +422,9 @@ def as_table( Returns: Arrow table representation """ - include_meta_columns = include_all_info or include_meta_columns - include_context = include_all_info or include_context + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context all_tables = [self._data_table] @@ -455,9 +464,9 @@ def as_table( def as_arrow_compatible_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """ Return dictionary representation compatible with Arrow. @@ -472,11 +481,7 @@ def as_arrow_compatible_dict( Returns: Dictionary representation compatible with Arrow """ - return self.as_table( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ).to_pylist()[0] + return self.as_table(columns=columns, all_info=all_info).to_pylist()[0] # 5. 
Meta Column Operations def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: diff --git a/src/orcapod/core/datagrams/arrow_tag_packet.py b/src/orcapod/core/datagrams/arrow_tag_packet.py index 24d2185..e6d2cd1 100644 --- a/src/orcapod/core/datagrams/arrow_tag_packet.py +++ b/src/orcapod/core/datagrams/arrow_tag_packet.py @@ -1,16 +1,14 @@ import logging -from collections.abc import Collection, Mapping -from typing import Self, TYPE_CHECKING +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Self - -from orcapod.core.system_constants import constants from orcapod import contexts +from orcapod.core.datagrams.arrow_datagram import ArrowDatagram +from orcapod.core.system_constants import constants +from orcapod.protocols.core_protocols import ColumnConfig from orcapod.semantic_types import infer_python_schema_from_pylist_data - from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils - -from orcapod.core.datagrams.arrow_datagram import ArrowDatagram from orcapod.utils.lazy_module import LazyModule logger = logging.getLogger(__name__) @@ -76,43 +74,40 @@ def __init__( def keys( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[str, ...]: keys = super().keys( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, + columns=columns, + all_info=all_info, ) - if include_all_info or include_system_tags: + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.system_tags: keys += tuple(self._system_tags_dict.keys()) return keys - def types( + def schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> PythonSchema: """Return copy of the Python schema.""" - schema = super().types( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, + schema = super().schema( + columns=columns, + all_info=all_info, ) - if include_all_info or include_system_tags: + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.system_tags: schema.update(self._system_tags_python_schema) return schema def arrow_schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. 
@@ -125,11 +120,11 @@ def arrow_schema( PyArrow schema representing the datagram's structure """ schema = super().arrow_schema( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, + columns=columns, + all_info=all_info, ) - if include_all_info or include_system_tags: + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.system_tags: return arrow_utils.join_arrow_schemas( schema, self._system_tags_table.schema ) @@ -137,10 +132,9 @@ def arrow_schema( def as_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """ Convert to dictionary representation. @@ -152,47 +146,43 @@ def as_dict( Dictionary representation of the packet """ return_dict = super().as_dict( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, + columns=columns, + all_info=all_info, ) - if include_all_info or include_system_tags: + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.system_tags: return_dict.update(self._system_tags_dict) return return_dict def as_table( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": table = super().as_table( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, + columns=columns, + all_info=all_info, ) - if ( - include_all_info or include_system_tags - ) and self._system_tags_table.num_columns > 0: + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.system_tags and self._system_tags_table.num_columns > 0: # add system_tags only if there are actual system tag columns table = arrow_utils.hstack_tables(table, self._system_tags_table) return table def as_datagram( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> ArrowDatagram: table = self.as_table( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_system_tags=include_system_tags, + columns=columns, + all_info=all_info, ) return ArrowDatagram( table, - data_context=self._data_context, + data_context=self.data_context, ) def system_tags(self) -> dict[str, DataValue | None]: @@ -287,44 +277,41 @@ def __init__( def keys( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[str, ...]: keys = super().keys( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, + columns=columns, + all_info=all_info, ) - if include_all_info or include_source: + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: keys += tuple(f"{constants.SOURCE_PREFIX}{k}" for k in self.keys()) return keys - def 
types( + def schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> PythonSchema: """Return copy of the Python schema.""" - schema = super().types( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, + schema = super().schema( + columns=columns, + all_info=all_info, ) - if include_all_info or include_source: + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: for key in self.keys(): schema[f"{constants.SOURCE_PREFIX}{key}"] = str return schema def arrow_schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. @@ -336,12 +323,9 @@ def arrow_schema( Returns: PyArrow schema representing the datagram's structure """ - schema = super().arrow_schema( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_source: + schema = super().arrow_schema(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: return arrow_utils.join_arrow_schemas( schema, self._source_info_table.schema ) @@ -349,10 +333,9 @@ def arrow_schema( def as_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """ Convert to dictionary representation. 
@@ -363,12 +346,9 @@ def as_dict( Returns: Dictionary representation of the packet """ - return_dict = super().as_dict( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_source: + return_dict = super().as_dict(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: return_dict.update( { f"{constants.SOURCE_PREFIX}{k}": v @@ -379,17 +359,13 @@ def as_dict( def as_table( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": - table = super().as_table( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_source: + table = super().as_table(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: # add source_info only if there are columns and the table has meaningful data if ( self._source_info_table.num_columns > 0 @@ -400,15 +376,11 @@ def as_table( def as_datagram( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> ArrowDatagram: - table = self.as_table( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_source=include_source, - ) + table = self.as_table(columns=columns, all_info=all_info) return ArrowDatagram( table, data_context=self._data_context, diff --git a/src/orcapod/core/datagrams/base.py b/src/orcapod/core/datagrams/base.py index ec68860..653f283 100644 --- a/src/orcapod/core/datagrams/base.py +++ b/src/orcapod/core/datagrams/base.py @@ -19,13 +19,12 @@ import logging from abc import abstractmethod from collections.abc import Collection, Iterator, Mapping -from typing import Self, TypeAlias, TYPE_CHECKING -from orcapod import contexts -from orcapod.core.base import ContentIdentifiableBase -from orcapod.protocols.hashing_protocols import ContentHash +from typing import TYPE_CHECKING, Any, Self, TypeAlias -from orcapod.utils.lazy_module import LazyModule +from orcapod.core.base import ContentIdentifiableBase +from orcapod.protocols.core_protocols import ColumnConfig from orcapod.types import DataValue, PythonSchema +from orcapod.utils.lazy_module import LazyModule logger = logging.getLogger(__name__) @@ -119,22 +118,19 @@ class BaseDatagram(ContentIdentifiableBase): is interpreted and used is left to concrete implementations. """ - def __init__(self, data_context: contexts.DataContext | str | None = None) -> None: - """ - Initialize base datagram with data context. + # TODO: revisit handling of identity structure for datagrams + def identity_structure(self) -> Any: + raise NotImplementedError() - Args: - data_context: Context for semantic interpretation. Can be a string key - or a DataContext object, or None for default. + @property + def converter(self): """ - self._data_context = contexts.resolve_context(data_context) - self._converter = self._data_context.type_converter + Get the semantic type converter associated with this datagram's context. - # 1. 
Core Properties (Identity & Structure) - @property - def data_context_key(self) -> str: - """Return the data context key.""" - return self._data_context.context_key + Returns: + SemanticConverter: The type converter for this datagram's data context + """ + return self.data_context.type_converter @property @abstractmethod @@ -169,19 +165,19 @@ def get(self, key: str, default: DataValue = None) -> DataValue: @abstractmethod def keys( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[str, ...]: """Return tuple of column names.""" ... @abstractmethod - def types( + def schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> PythonSchema: """Return type specification for the datagram.""" ... @@ -189,25 +185,20 @@ def types( @abstractmethod def arrow_schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Schema": """Return the PyArrow schema for this datagram.""" ... - @abstractmethod - def content_hash(self) -> ContentHash: - """Calculate and return content hash of the datagram.""" - ... - # 4. Format Conversions (Export) @abstractmethod def as_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """Return dictionary representation of the datagram.""" ... @@ -215,9 +206,9 @@ def as_dict( @abstractmethod def as_table( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": """Convert the datagram to an Arrow table.""" ... @@ -272,7 +263,7 @@ def with_columns( def with_context_key(self, new_context_key: str) -> Self: """Create new datagram with different data context.""" new_datagram = self.copy(include_cache=False) - new_datagram._data_context = contexts.resolve_context(new_context_key) + new_datagram.data_context = new_context_key return new_datagram # 8. 
Utility Operations diff --git a/src/orcapod/core/datagrams/dict_datagram.py b/src/orcapod/core/datagrams/dict_datagram.py index 642a5b2..c46860e 100644 --- a/src/orcapod/core/datagrams/dict_datagram.py +++ b/src/orcapod/core/datagrams/dict_datagram.py @@ -1,15 +1,16 @@ import logging from collections.abc import Collection, Iterator, Mapping -from typing import Self, cast, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Self, cast -from orcapod.utils.lazy_module import LazyModule -from orcapod.core.system_constants import constants from orcapod import contexts from orcapod.core.datagrams.base import BaseDatagram +from orcapod.core.system_constants import constants +from orcapod.protocols.core_protocols import ColumnConfig +from orcapod.protocols.hashing_protocols import ContentHash from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import arrow_utils -from orcapod.protocols.hashing_protocols import ContentHash +from orcapod.utils.lazy_module import LazyModule logger = logging.getLogger(__name__) @@ -96,7 +97,7 @@ def __init__( # Initialize base class with data context final_context = data_context or cast(str, extracted_context) - super().__init__(final_context) + super().__init__(data_context=final_context) # Store data and meta components separately (immutable) self._data = dict(data_columns) @@ -181,13 +182,14 @@ def get(self, key: str, default: DataValue = None) -> DataValue: # 3. Structural Information def keys( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[str, ...]: """Return tuple of column names.""" - include_meta_columns = include_all_info or include_meta_columns - include_context = include_all_info or include_context + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context # Start with data columns result_keys = list(self._data.keys()) @@ -210,11 +212,11 @@ def keys( return tuple(result_keys) - def types( + def schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> PythonSchema: """ Return Python schema for the datagram. @@ -229,8 +231,9 @@ def types( Returns: Python schema """ - include_meta_columns = include_all_info or include_meta_columns - include_context = include_all_info or include_context + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context # Start with data schema schema = dict(self._data_python_schema) @@ -255,9 +258,9 @@ def types( def arrow_schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. 
@@ -272,8 +275,9 @@ def arrow_schema( Returns: PyArrow schema representing the datagram's structure """ - include_meta_columns = include_all_info or include_meta_columns - include_context = include_all_info or include_context + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context # Build data schema (cached) if self._cached_data_arrow_schema is None: @@ -287,7 +291,7 @@ def arrow_schema( # Add context schema if requested if include_context: - context_schema = self._converter.python_schema_to_arrow_schema( + context_schema = self.converter.python_schema_to_arrow_schema( {constants.CONTEXT_KEY: str} ) all_schemas.append(context_schema) @@ -323,16 +327,16 @@ def content_hash(self) -> ContentHash: """ if self._cached_content_hash is None: self._cached_content_hash = self._data_context.arrow_hasher.hash_table( - self.as_table(include_meta_columns=False, include_context=False), + self.as_table(columns={"meta": False, "context": False}), ) return self._cached_content_hash # 4. Format Conversions (Export) def as_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """ Return dictionary representation of the datagram. @@ -347,8 +351,9 @@ def as_dict( Returns: Dictionary representation """ - include_context = include_all_info or include_context - include_meta_columns = include_all_info or include_meta_columns + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context result_dict = dict(self._data) # Start with user data @@ -374,9 +379,9 @@ def as_dict( def as_arrow_compatible_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """ Return dictionary representation compatible with Arrow. @@ -392,16 +397,8 @@ def as_arrow_compatible_dict( Dictionary representation compatible with Arrow """ # FIXME: this is a super inefficient implementation! - python_dict = self.as_dict( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - python_schema = self.types( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) + python_dict = self.as_dict(columns=columns, all_info=all_info) + python_schema = self.schema(columns=columns, all_info=all_info) return self._data_context.type_converter.python_dicts_to_struct_dicts( [python_dict], python_schema=python_schema @@ -434,9 +431,9 @@ def _get_meta_arrow_schema(self) -> "pa.Schema": def as_table( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": """ Convert the datagram to an Arrow table. 
@@ -451,8 +448,9 @@ def as_table( Returns: Arrow table representation """ - include_context = include_all_info or include_context - include_meta_columns = include_all_info or include_meta_columns + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + include_meta_columns = column_config.meta + include_context = column_config.context # Build data table (cached) if self._cached_data_table is None: @@ -750,7 +748,7 @@ def with_columns( new_data.update(updates) # Create updated python schema - handle None values by defaulting to str - python_schema = self.types() + python_schema = self.schema() if column_types is not None: python_schema.update(column_types) diff --git a/src/orcapod/core/datagrams/dict_tag_packet.py b/src/orcapod/core/datagrams/dict_tag_packet.py index 11e6d66..1b20b59 100644 --- a/src/orcapod/core/datagrams/dict_tag_packet.py +++ b/src/orcapod/core/datagrams/dict_tag_packet.py @@ -1,14 +1,14 @@ import logging -from collections.abc import Collection, Mapping -from typing import Self, TYPE_CHECKING +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Self - -from orcapod.core.system_constants import constants from orcapod import contexts from orcapod.core.datagrams.dict_datagram import DictDatagram -from orcapod.utils import arrow_utils +from orcapod.core.system_constants import constants +from orcapod.protocols.core_protocols import ColumnConfig from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue, PythonSchema, PythonSchemaLike +from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: @@ -73,19 +73,15 @@ def _get_total_dict(self) -> dict[str, DataValue]: def as_table( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": """Convert the packet to an Arrow table.""" - table = super().as_table( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) + table = super().as_table(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) - if include_all_info or include_system_tags: + if column_config.system_tags: # Only create and stack system tags table if there are actually system tags if self._system_tags: # Check if system tags dict is not empty if self._cached_system_tags_table is None: @@ -100,10 +96,9 @@ def as_table( def as_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """ Return dictionary representation. 
@@ -114,55 +109,44 @@ def as_dict( Returns: Dictionary representation of the packet """ - dict_copy = super().as_dict( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_system_tags: + dict_copy = super().as_dict(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + + if column_config.system_tags: dict_copy.update(self._system_tags) return dict_copy def keys( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[str, ...]: """Return keys of the Python schema.""" - keys = super().keys( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_system_tags: + keys = super().keys(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.system_tags: keys += tuple(self._system_tags.keys()) return keys - def types( + def schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> PythonSchema: """Return copy of the Python schema.""" - schema = super().types( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_system_tags: + schema = super().schema(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.system_tags: schema.update(self._system_tags_python_schema) return schema def arrow_schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. @@ -174,12 +158,9 @@ def arrow_schema( Returns: PyArrow schema representing the datagram's structure """ - schema = super().arrow_schema( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_system_tags: + schema = super().arrow_schema(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.system_tags: if self._cached_system_tags_schema is None: self._cached_system_tags_schema = ( self._data_context.type_converter.python_schema_to_arrow_schema( @@ -193,9 +174,9 @@ def arrow_schema( def as_datagram( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_system_tags: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> DictDatagram: """ Convert the packet to a DictDatagram. 
@@ -207,16 +188,8 @@ def as_datagram( DictDatagram representation of the packet """ - data = self.as_dict( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_system_tags=include_system_tags, - ) - python_schema = self.types( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_system_tags=include_system_tags, - ) + data = self.as_dict(columns=columns, all_info=all_info) + python_schema = self.schema(columns=columns, all_info=all_info) return DictDatagram( data, python_schema=python_schema, @@ -299,7 +272,7 @@ def __init__( def _source_info_arrow_schema(self) -> "pa.Schema": if self._cached_source_info_schema is None: self._cached_source_info_schema = ( - self._converter.python_schema_to_arrow_schema( + self.converter.python_schema_to_arrow_schema( self._source_info_python_schema ) ) @@ -313,18 +286,14 @@ def _source_info_python_schema(self) -> dict[str, type]: def as_table( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": """Convert the packet to an Arrow table.""" - table = super().as_table( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_source: + table = super().as_table(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: if self._cached_source_info_table is None: source_info_data = { f"{constants.SOURCE_PREFIX}{k}": v @@ -349,10 +318,9 @@ def as_table( def as_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """ Return dictionary representation. 
@@ -363,47 +331,36 @@ def as_dict( Returns: Dictionary representation of the packet """ - dict_copy = super().as_dict( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_source: + dict_copy = super().as_dict(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: for key, value in self.source_info().items(): dict_copy[f"{constants.SOURCE_PREFIX}{key}"] = value return dict_copy def keys( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[str, ...]: """Return keys of the Python schema.""" - keys = super().keys( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_source: + keys = super().keys(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: keys += tuple(f"{constants.SOURCE_PREFIX}{key}" for key in super().keys()) return keys - def types( + def schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> PythonSchema: """Return copy of the Python schema.""" - schema = super().types( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_source: + schema = super().schema(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: for key in self.keys(): schema[f"{constants.SOURCE_PREFIX}{key}"] = str return schema @@ -442,10 +399,9 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: def arrow_schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. @@ -457,12 +413,9 @@ def arrow_schema( Returns: PyArrow schema representing the datagram's structure """ - schema = super().arrow_schema( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_all_info or include_source: + schema = super().arrow_schema(columns=columns, all_info=all_info) + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + if column_config.source: return arrow_utils.join_arrow_schemas( schema, self._source_info_arrow_schema ) @@ -470,9 +423,9 @@ def arrow_schema( def as_datagram( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_source: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> DictDatagram: """ Convert the packet to a DictDatagram. 
@@ -484,18 +437,10 @@ def as_datagram( DictDatagram representation of the packet """ - data = self.as_dict( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_source=include_source, - ) - python_schema = self.types( - include_all_info=include_all_info, - include_meta_columns=include_meta_columns, - include_source=include_source, - ) + data = self.as_dict(columns=columns, all_info=all_info) + python_schema = self.schema(columns=columns, all_info=all_info) return DictDatagram( - data, + data=data, python_schema=python_schema, data_context=self._data_context, ) diff --git a/src/orcapod/core/executable_pod.py b/src/orcapod/core/executable_pod.py new file mode 100644 index 0000000..cdeab99 --- /dev/null +++ b/src/orcapod/core/executable_pod.py @@ -0,0 +1,306 @@ +import logging +from abc import abstractmethod +from collections.abc import Collection, Iterator +from datetime import datetime +from typing import TYPE_CHECKING, Any, cast + +from orcapod.core.base import OrcapodBase +from orcapod.core.streams.base import StreamBase +from orcapod.core.tracker import DEFAULT_TRACKER_MANAGER +from orcapod.protocols.core_protocols import ( + ArgumentGroup, + ColumnConfig, + Packet, + Pod, + Stream, + Tag, + TrackerManager, +) +from orcapod.types import PythonSchema +from orcapod.utils.lazy_module import LazyModule + +logger = logging.getLogger(__name__) + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class ExecutablePod(OrcapodBase): + """ + Abstract Base class for all pods that requires execution to generate + static output stream. The output stream will reexecute the pod as necessary + to keep the output stream current. + """ + + def __init__(self, tracker_manager: TrackerManager | None = None, **kwargs) -> None: + self.tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + super().__init__(**kwargs) + + @property + def uri(self) -> tuple[str, ...]: + """ + Returns a unique resource identifier for the pod. + The pod URI must uniquely determine the necessary schema for the pod's information + """ + return ( + f"{self.__class__.__name__}", + self.content_hash().to_hex(), + ) + + @abstractmethod + def validate_inputs(self, *streams: Stream) -> None: + """ + Validate input streams, raising exceptions if invalid. + + Should check: + - Number of input streams + - Stream types and schemas + - Kernel-specific requirements + - Business logic constraints + + Args: + *streams: Input streams to validate + + Raises: + PodInputValidationError: If inputs are invalid + """ + ... + + @abstractmethod + def argument_symmetry(self, streams: Collection[Stream]) -> ArgumentGroup: + """ + Describe symmetry/ordering constraints on input arguments. + + Returns a structure encoding which arguments can be reordered: + - SymmetricGroup (frozenset): Arguments commute (order doesn't matter) + - OrderedGroup (tuple): Arguments have fixed positions + - Nesting expresses partial symmetry + + Examples: + Full symmetry (Join): + return frozenset([a, b, c]) + + No symmetry (Concatenate): + return (a, b, c) + + Partial symmetry: + return (frozenset([a, b]), c) + # a,b are interchangeable, c has fixed position + """ + ... + + @abstractmethod + def output_schema( + self, + *streams: Stream, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> tuple[PythonSchema, PythonSchema]: + """ + Determine output types without triggering computation. 
+
+        This method performs type inference based on input stream types,
+        enabling efficient type checking and stream property queries.
+        It should be fast and not trigger any expensive computation.
+
+        Used for:
+        - Pre-execution type validation
+        - Query planning and optimization
+        - Schema inference in complex pipelines
+        - IDE support and developer tooling
+
+        Args:
+            *streams: Input streams to analyze
+
+        Returns:
+            tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) for output
+
+        Raises:
+            ValidationError: If input types are incompatible
+            TypeError: If stream types cannot be processed
+        """
+        ...
+
+    @abstractmethod
+    def execute(self, *streams: Stream) -> Stream:
+        """
+        Executes the pod on the input streams, returning a new static output stream.
+        The output of execute is expected to be a static stream and thus only represents
+        the instantaneous computation of the pod on the input streams.
+
+        Concrete subclasses implementing a Pod should override this method to provide
+        the pod's unique processing logic.
+
+        Args:
+            *streams: Input streams to process
+
+        Returns:
+            cp.Stream: The resulting output stream
+        """
+        ...
+
+    def process(self, *streams: Stream, label: str | None = None) -> Stream:
+        """
+        Invoke the pod on a collection of streams, returning an ExecutablePodStream
+        that represents the computation.
+
+        Args:
+            *streams: Input streams to process
+
+        Returns:
+            cp.Stream: The resulting output stream
+        """
+        logger.debug(f"Invoking pod {self} on streams: {streams}")
+
+        # perform input stream validation
+        self.validate_inputs(*streams)
+        self.tracker_manager.record_pod_invocation(self, upstreams=streams, label=label)
+        output_stream = ExecutablePodStream(
+            pod=self,
+            upstreams=streams,
+        )
+        return output_stream
+
+    def __call__(self, *streams: Stream, **kwargs) -> Stream:
+        """
+        Convenience method to invoke the pod process on a collection of streams.
+        """
+        logger.debug(f"Invoking pod {self} on streams through __call__: {streams}")
+        # perform input stream validation
+        return self.process(*streams, **kwargs)
+
+
+class ExecutablePodStream(StreamBase):
+    """
+    Recomputable stream wrapping an ExecutablePod.
+
+    This stream is used to represent the output of an ExecutablePod invocation.
+
+    For a more general recomputable stream for Pod (orcapod.protocols.Pod), use
+    PodStream.
+    """
+
+    def __init__(
+        self,
+        pod: ExecutablePod,
+        upstreams: tuple[
+            Stream, ...
+        ] = (),  # if provided, this will override the upstreams of the output_stream
+        **kwargs,
+    ) -> None:
+        self._pod = pod
+        self._upstreams = upstreams
+
+        super().__init__(**kwargs)
+        self._set_modified_time(None)
+        self._cached_time: datetime | None = None
+        self._cached_stream: Stream | None = None
+
+    @property
+    def source(self) -> Pod:
+        return self._pod
+
+    @property
+    def upstreams(self) -> tuple[Stream, ...]:
+        return self._upstreams
+
+    def clear_cache(self) -> None:
+        """
+        Clears the cached stream.
+        This is useful for re-processing the stream with the same pod.
+        """
+        self._cached_stream = None
+        self._cached_time = None
+
+    def keys(
+        self,
+        *,
+        columns: ColumnConfig | dict[str, Any] | None = None,
+        all_info: bool = False,
+    ) -> tuple[tuple[str, ...], tuple[str, ...]]:
+        """
+        Returns the keys of the tag and packet columns in the stream.
+        """
+        tag_schema, packet_schema = self._pod.output_schema(
+            *self.upstreams,
+            columns=columns,
+            all_info=all_info,
+        )
+        return tuple(tag_schema.keys()), tuple(packet_schema.keys())
+
+    def output_schema(
+        self,
+        *,
+        columns: ColumnConfig | dict[str, Any] | None = None,
+        all_info: bool = False,
+    ) -> tuple[PythonSchema, PythonSchema]:
+        """
+        Returns the schemas of the tag and packet columns in the stream.
+        """
+        return self._pod.output_schema(
+            *self.upstreams,
+            columns=columns,
+            all_info=all_info,
+        )
+
+    @property
+    def last_modified(self) -> datetime | None:
+        """Returns the last modified time of the stream."""
+        self._update_cache_status()
+        return self._cached_time
+
+    def _update_cache_status(self) -> None:
+        if self._cached_time is None:
+            return
+
+        upstream_times = [stream.last_modified for stream in self.upstreams]
+        upstream_times.append(self._pod.last_modified)
+
+        if any(t is None for t in upstream_times):
+            self._cached_stream = None
+            self._cached_time = None
+            return
+
+        # Get the maximum upstream time
+        max_upstream_time = max(cast(list[datetime], upstream_times))
+
+        # Invalidate cache if upstream is newer and update the cache time
+        if max_upstream_time > self._cached_time:
+            self._cached_stream = None
+            self._cached_time = max_upstream_time
+
+    def run(self, *args: Any, **kwargs: Any) -> None:
+        self._update_cache_status()
+
+        # recompute if cache is invalid
+        if self._cached_time is None or self._cached_stream is None:
+            self._cached_stream = self._pod.execute(
+                *self.upstreams,
+            )
+            self._cached_time = datetime.now()
+
+    def as_table(
+        self,
+        *,
+        columns: ColumnConfig | dict[str, Any] | None = None,
+        all_info: bool = False,
+    ) -> "pa.Table":
+        self.run()
+        assert self._cached_stream is not None, (
+            "Stream has not been updated or is empty."
+        )
+        return self._cached_stream.as_table(columns=columns, all_info=all_info)
+
+    def iter_packets(
+        self,
+    ) -> Iterator[tuple[Tag, Packet]]:
+        self.run()
+        assert self._cached_stream is not None, (
+            "Stream has not been updated or is empty."
+        )
+        return self._cached_stream.iter_packets()
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(pod={self.source}, upstreams={self.upstreams})"
diff --git a/src/orcapod/core/execution_engine.py b/src/orcapod/core/execution_engine.py
new file mode 100644
index 0000000..98a242c
--- /dev/null
+++ b/src/orcapod/core/execution_engine.py
@@ -0,0 +1,22 @@
+from collections.abc import Callable
+from typing import Any, Protocol, runtime_checkable
+
+
+@runtime_checkable
+class ExecutionEngine(Protocol):
+    @property
+    def name(self) -> str: ...
+
+    def submit_sync(self, function: Callable, *args, **kwargs) -> Any:
+        """
+        Run the given function with the provided arguments.
+        This method should be implemented by the execution engine.
+        """
+        ...
+
+    async def submit_async(self, function: Callable, *args, **kwargs) -> Any:
+        """
+        Asynchronously run the given function with the provided arguments.
+        This method should be implemented by the execution engine.
+        """
+        ...
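For reference, a concrete engine only needs to provide the `name` property and the two `submit_*` methods of the `ExecutionEngine` protocol above. The sketch below is illustrative only and is not part of this patch; the class name `LocalExecutionEngine` and the asyncio-based async path are assumptions about how one might satisfy the protocol.

import asyncio
from collections.abc import Callable
from functools import partial
from typing import Any


class LocalExecutionEngine:
    """A minimal in-process engine that satisfies the ExecutionEngine protocol."""

    @property
    def name(self) -> str:
        return "local"

    def submit_sync(self, function: Callable, *args, **kwargs) -> Any:
        # Run the function immediately in the calling thread.
        return function(*args, **kwargs)

    async def submit_async(self, function: Callable, *args, **kwargs) -> Any:
        # Run the function in the default executor so the event loop is not blocked.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, partial(function, *args, **kwargs))


# Because ExecutionEngine is a runtime_checkable Protocol, a structural check passes:
# isinstance(LocalExecutionEngine(), ExecutionEngine)  # -> True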
diff --git a/src/orcapod/core/function_pod.py b/src/orcapod/core/function_pod.py new file mode 100644 index 0000000..9da0829 --- /dev/null +++ b/src/orcapod/core/function_pod.py @@ -0,0 +1,706 @@ +import logging +from collections.abc import Callable, Collection, Iterator +from typing import TYPE_CHECKING, Any, Protocol, cast + +from orcapod import contexts +from orcapod.core.base import OrcapodBase +from orcapod.core.operators import Join +from orcapod.core.packet_function import PythonPacketFunction +from orcapod.core.streams.base import StreamBase +from orcapod.core.tracker import DEFAULT_TRACKER_MANAGER +from orcapod.protocols.core_protocols import ( + ArgumentGroup, + ColumnConfig, + Packet, + PacketFunction, + Pod, + Stream, + Tag, + TrackerManager, +) +from orcapod.types import PythonSchema +from orcapod.utils import arrow_utils, schema_utils +from orcapod.utils.lazy_module import LazyModule + +logger = logging.getLogger(__name__) + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class FunctionPod(OrcapodBase): + def __init__( + self, + packet_function: PacketFunction, + tracker_manager: TrackerManager | None = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + self.packet_function = packet_function + self._output_schema_hash = self.data_context.object_hasher.hash_object( + self.packet_function.output_packet_schema + ).to_string() + + def identity_structure(self) -> Any: + return self.packet_function + + @property + def uri(self) -> tuple[str, ...]: + return ( + self.packet_function.packet_function_type_id, + f"v{self.packet_function.major_version}", + self._output_schema_hash, + ) + + def multi_stream_handler(self) -> Pod: + return Join() + + def validate_inputs(self, *streams: Stream) -> None: + """ + Validate input streams, raising exceptions if invalid. + + Should check: + - Number of input streams + - Stream types and schemas + - Kernel-specific requirements + - Business logic constraints + + Args: + *streams: Input streams to validate + + Raises: + PodInputValidationError: If inputs are invalid + """ + if len(streams) != 1: + raise ValueError( + f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" + ) + input_stream = streams[0] + _, incoming_packet_types = input_stream.output_schema() + expected_packet_schema = self.packet_function.input_packet_schema + if not schema_utils.check_typespec_compatibility( + incoming_packet_types, expected_packet_schema + ): + # TODO: use custom exception type for better error handling + raise ValueError( + f"Incoming packet data type {incoming_packet_types} from {input_stream} is not compatible with expected input typespec {expected_packet_schema}" + ) + + def process( + self, *streams: Stream, label: str | None = None + ) -> "FunctionPodStream": + """ + Invoke the packet processor on the input stream. + If multiple streams are passed in, all streams are joined before processing. 
+
+        Args:
+            *streams: Input streams to process
+
+        Returns:
+            cp.Stream: The resulting output stream
+        """
+        logger.debug(f"Invoking pod {self} on streams: {streams}")
+
+        # handle multiple input streams
+        if len(streams) == 0:
+            raise ValueError("At least one input stream is required")
+        elif len(streams) > 1:
+            multi_stream_handler = self.multi_stream_handler()
+            joined_stream = multi_stream_handler.process(*streams)
+            streams = (joined_stream,)
+        input_stream = streams[0]
+
+        # perform input stream validation
+        self.validate_inputs(*streams)
+        self.tracker_manager.record_packet_function_invocation(
+            self.packet_function, input_stream, label=label
+        )
+        output_stream = FunctionPodStream(
+            function_pod=self,
+            input_stream=input_stream,
+        )
+        return output_stream
+
+    def __call__(self, *streams: Stream, **kwargs) -> "FunctionPodStream":
+        """
+        Convenience method to invoke the pod process on a collection of streams.
+        """
+        logger.debug(f"Invoking pod {self} on streams through __call__: {streams}")
+        # perform input stream validation
+        return self.process(*streams, **kwargs)
+
+    def argument_symmetry(self, streams: Collection[Stream]) -> ArgumentGroup:
+        return self.multi_stream_handler().argument_symmetry(streams)
+
+    def output_schema(
+        self,
+        *streams: Stream,
+        columns: ColumnConfig | dict[str, Any] | None = None,
+        all_info: bool = False,
+    ) -> tuple[PythonSchema, PythonSchema]:
+        tag_schema = self.multi_stream_handler().output_schema(
+            *streams, columns=columns, all_info=all_info
+        )[0]
+        # The output schema of the FunctionPod is determined by the packet function
+        # TODO: handle and extend to include additional columns
+        return tag_schema, self.packet_function.output_packet_schema
+
+
+class FunctionPodStream(StreamBase):
+    """
+    Recomputable stream wrapping a packet function.
+    """
+
+    def __init__(
+        self, function_pod: FunctionPod, input_stream: Stream, **kwargs
+    ) -> None:
+        self._function_pod = function_pod
+        self._input_stream = input_stream
+        super().__init__(**kwargs)
+
+        # capture the iterator over the input stream
+        self._cached_input_iterator = input_stream.iter_packets()
+        self._update_modified_time()  # update the modified time to AFTER we obtain the iterator
+        # note that the invocation of iter_packets on upstream likely triggers the modified time
+        # to be updated on the upstream. Hence you want to set this stream's modified time after that.
+
+        # Packet-level caching (for the output packets)
+        self._cached_output_packets: dict[int, tuple[Tag, Packet | None]] = {}
+        self._cached_output_table: pa.Table | None = None
+        self._cached_content_hash_column: pa.Array | None = None
+
+    def identity_structure(self):
+        return (
+            self._function_pod,
+            self._input_stream,
+        )
+
+    @property
+    def source(self) -> Pod:
+        return self._function_pod
+
+    @property
+    def upstreams(self) -> tuple[Stream, ...]:
+        return (self._input_stream,)
+
+    def keys(
+        self,
+        *,
+        columns: ColumnConfig | dict[str, Any] | None = None,
+        all_info: bool = False,
+    ) -> tuple[tuple[str, ...], tuple[str, ...]]:
+        tag_schema, packet_schema = self.output_schema(
+            columns=columns, all_info=all_info
+        )
+
+        return tuple(tag_schema.keys()), tuple(packet_schema.keys())
+
+    def output_schema(
+        self,
+        *,
+        columns: ColumnConfig | dict[str, Any] | None = None,
+        all_info: bool = False,
+    ) -> tuple[PythonSchema, PythonSchema]:
+        tag_schema = self._input_stream.output_schema(
+            columns=columns, all_info=all_info
+        )[0]
+        packet_schema = self._function_pod.packet_function.output_packet_schema
+        return (tag_schema, packet_schema)
+
+    def __iter__(self) -> Iterator[tuple[Tag, Packet]]:
+        return self.iter_packets()
+
+    def iter_packets(self) -> Iterator[tuple[Tag, Packet]]:
+        if self._cached_input_iterator is not None:
+            for i, (tag, packet) in enumerate(self._cached_input_iterator):
+                if i in self._cached_output_packets:
+                    # Use cached result
+                    tag, packet = self._cached_output_packets[i]
+                    if packet is not None:
+                        yield tag, packet
+                else:
+                    # Process packet
+                    output_packet = self._function_pod.packet_function.call(packet)
+                    self._cached_output_packets[i] = (tag, output_packet)
+                    if output_packet is not None:
+                        # Update shared cache for future iterators (optimization)
+                        yield tag, output_packet
+
+            # Mark completion by releasing the iterator
+            self._cached_input_iterator = None
+        else:
+            # Yield from snapshot of complete cache
+            for i in range(len(self._cached_output_packets)):
+                tag, packet = self._cached_output_packets[i]
+                if packet is not None:
+                    yield tag, packet
+
+    def as_table(
+        self,
+        *,
+        columns: ColumnConfig | dict[str, Any] | None = None,
+        all_info: bool = False,
+    ) -> "pa.Table":
+        if self._cached_output_table is None:
+            all_tags = []
+            all_packets = []
+            tag_schema, packet_schema = None, None
+            for tag, packet in self.iter_packets():
+                if tag_schema is None:
+                    tag_schema = tag.arrow_schema(all_info=True)
+                if packet_schema is None:
+                    packet_schema = packet.arrow_schema(all_info=True)
+                # TODO: make use of arrow_compat dict
+                all_tags.append(tag.as_dict(all_info=True))
+                all_packets.append(packet.as_dict(all_info=True))
+
+            # TODO: re-verify the implementation of this conversion
+            converter = self.data_context.type_converter
+
+            struct_packets = converter.python_dicts_to_struct_dicts(all_packets)
+            all_tags_as_tables: pa.Table = pa.Table.from_pylist(
+                all_tags, schema=tag_schema
+            )
+            all_packets_as_tables: pa.Table = pa.Table.from_pylist(
+                struct_packets, schema=packet_schema
+            )
+
+            self._cached_output_table = arrow_utils.hstack_tables(
+                all_tags_as_tables, all_packets_as_tables
+            )
+            assert self._cached_output_table is not None, (
+                "_cached_output_table should not be None here."
+            )
+
+        return self._cached_output_table
+
+        # drop_columns = []
+        # if not include_system_tags:
+        #     # TODO: get system tags more efficiently
+        #     drop_columns.extend(
+        #         [
+        #             c
+        #             for c in self._cached_output_table.column_names
+        #             if c.startswith(constants.SYSTEM_TAG_PREFIX)
+        #         ]
+        #     )
+        # if not include_source:
+        #     drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1])
+        # if not include_data_context:
+        #     drop_columns.append(constants.CONTEXT_KEY)
+
+        # output_table = self._cached_output_table.drop(drop_columns)
+
+        # # lazily prepare content hash column if requested
+        # if include_content_hash:
+        #     if self._cached_content_hash_column is None:
+        #         content_hashes = []
+        #         # TODO: verify that order will be preserved
+        #         for tag, packet in self.iter_packets():
+        #             content_hashes.append(packet.content_hash().to_string())
+        #         self._cached_content_hash_column = pa.array(
+        #             content_hashes, type=pa.large_string()
+        #         )
+        #     assert self._cached_content_hash_column is not None, (
+        #         "_cached_content_hash_column should not be None here."
+        #     )
+        #     hash_column_name = (
+        #         "_content_hash"
+        #         if include_content_hash is True
+        #         else include_content_hash
+        #     )
+        #     output_table = output_table.append_column(
+        #         hash_column_name, self._cached_content_hash_column
+        #     )
+
+        # if sort_by_tags:
+        #     # TODO: reimplement using polars natively
+        #     output_table = (
+        #         pl.DataFrame(output_table)
+        #         .sort(by=self.keys()[0], descending=False)
+        #         .to_arrow()
+        #     )
+        #     # output_table = output_table.sort_by(
+        #     #     [(column, "ascending") for column in self.keys()[0]]
+        #     # )
+        # return output_table
+
+
+class CallableWithPod(Protocol):
+    @property
+    def pod(self) -> FunctionPod:
+        """
+        Returns the associated function pod
+        """
+        ...
+
+
+def function_pod(
+    output_keys: str | Collection[str] | None = None,
+    function_name: str | None = None,
+    version: str = "v0.0",
+    label: str | None = None,
+    **kwargs,
+) -> Callable[..., CallableWithPod]:
+    """
+    Decorator that attaches a FunctionPod as the `pod` attribute.
+
+    Args:
+        output_keys: Keys for the function output(s)
+        function_name: Name of the function pod; if None, defaults to the function name
+        **kwargs: Additional keyword arguments to pass to the FunctionPod constructor. Please refer to the FunctionPod documentation for details.
+
+    Returns:
+        CallableWithPod: Decorated function with `pod` attribute holding the FunctionPod instance
+    """
+
+    def decorator(func: Callable) -> CallableWithPod:
+        if func.__name__ == "<lambda>":
+            raise ValueError("Lambda functions cannot be used with function_pod")
+
+        # Store the original function in the module for pickling purposes
+        # and make sure to change the name of the function
+
+        packet_function = PythonPacketFunction(
+            func,
+            output_keys=output_keys,
+            function_name=function_name or func.__name__,
+            version=version,
+            label=label,
+            **kwargs,
+        )
+
+        # Create a simple typed function pod
+        pod = FunctionPod(
+            packet_function=packet_function,
+        )
+        setattr(func, "pod", pod)
+        return cast(CallableWithPod, func)
+
+    return decorator
+
+
+class WrappedFunctionPod(FunctionPod):
+    """
+    A wrapper for a function pod, allowing for additional functionality or modifications without changing the original pod.
+    This class is meant to serve as a base class for other pods that need to wrap existing pods.
+    Note that only the call logic is passed through to the wrapped pod, but the forward logic is not.
+ """ + + def __init__( + self, + function_pod: FunctionPod, + data_context: str | contexts.DataContext | None = None, + **kwargs, + ) -> None: + # if data_context is not explicitly given, use that of the contained pod + if data_context is None: + data_context = function_pod.data_context_key + super().__init__( + data_context=data_context, + **kwargs, + ) + self._function_pod = function_pod + + def computed_label(self) -> str | None: + return self._function_pod.label + + @property + def uri(self) -> tuple[str, ...]: + return self._function_pod.uri + + def validate_inputs(self, *streams: Stream) -> None: + self._function_pod.validate_inputs(*streams) + + def argument_symmetry(self, streams: Collection[Stream]) -> ArgumentGroup: + return self._function_pod.argument_symmetry(streams) + + def output_schema( + self, + *streams: Stream, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> tuple[PythonSchema, PythonSchema]: + return self._function_pod.output_schema( + *streams, columns=columns, all_info=all_info + ) + + # TODO: reconsider whether to return FunctionPodStream here in the signature + def process(self, *streams: Stream, label: str | None = None) -> FunctionPodStream: + return self._function_pod.process(*streams, label=label) + + +# class CachedFunctionPod(WrappedFunctionPod): +# """ +# A pod that caches the results of the wrapped pod. +# This is useful for pods that are expensive to compute and can benefit from caching. +# """ + +# # name of the column in the tag store that contains the packet hash +# DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" + +# def __init__( +# self, +# pod: cp.Pod, +# result_database: ArrowDatabase, +# record_path_prefix: tuple[str, ...] = (), +# match_tier: str | None = None, +# retrieval_mode: Literal["latest", "most_specific"] = "latest", +# **kwargs, +# ): +# super().__init__(pod, **kwargs) +# self.record_path_prefix = record_path_prefix +# self.result_database = result_database +# self.match_tier = match_tier +# self.retrieval_mode = retrieval_mode +# self.mode: Literal["production", "development"] = "production" + +# def set_mode(self, mode: str) -> None: +# if mode not in ("production", "development"): +# raise ValueError(f"Invalid mode: {mode}") +# self.mode = mode + +# @property +# def version(self) -> str: +# return self.pod.version + +# @property +# def record_path(self) -> tuple[str, ...]: +# """ +# Return the path to the record in the result store. +# This is used to store the results of the pod. 
+# """ +# return self.record_path_prefix + self.reference + +# def call( +# self, +# tag: cp.Tag, +# packet: cp.Packet, +# record_id: str | None = None, +# execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine +# | None = None, +# skip_cache_lookup: bool = False, +# skip_cache_insert: bool = False, +# ) -> tuple[cp.Tag, cp.Packet | None]: +# # TODO: consider logic for overwriting existing records +# execution_engine_hash = execution_engine.name if execution_engine else "default" +# if record_id is None: +# record_id = self.get_record_id( +# packet, execution_engine_hash=execution_engine_hash +# ) +# output_packet = None +# if not skip_cache_lookup and self.mode == "production": +# print("Checking for cache...") +# output_packet = self.get_cached_output_for_packet(packet) +# if output_packet is not None: +# print(f"Cache hit for {packet}!") +# if output_packet is None: +# tag, output_packet = super().call( +# tag, packet, record_id=record_id, execution_engine=execution_engine +# ) +# if ( +# output_packet is not None +# and not skip_cache_insert +# and self.mode == "production" +# ): +# self.record_packet(packet, output_packet, record_id=record_id) + +# return tag, output_packet + +# async def async_call( +# self, +# tag: cp.Tag, +# packet: cp.Packet, +# record_id: str | None = None, +# execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine +# | None = None, +# skip_cache_lookup: bool = False, +# skip_cache_insert: bool = False, +# ) -> tuple[cp.Tag, cp.Packet | None]: +# # TODO: consider logic for overwriting existing records +# execution_engine_hash = execution_engine.name if execution_engine else "default" + +# if record_id is None: +# record_id = self.get_record_id( +# packet, execution_engine_hash=execution_engine_hash +# ) +# output_packet = None +# if not skip_cache_lookup: +# output_packet = self.get_cached_output_for_packet(packet) +# if output_packet is None: +# tag, output_packet = await super().async_call( +# tag, packet, record_id=record_id, execution_engine=execution_engine +# ) +# if output_packet is not None and not skip_cache_insert: +# self.record_packet( +# packet, +# output_packet, +# record_id=record_id, +# execution_engine=execution_engine, +# ) + +# return tag, output_packet + +# def forward(self, *streams: cp.Stream) -> cp.Stream: +# assert len(streams) == 1, "PodBase.forward expects exactly one input stream" +# return CachedPodStream(pod=self, input_stream=streams[0]) + +# def record_packet( +# self, +# input_packet: cp.Packet, +# output_packet: cp.Packet, +# record_id: str | None = None, +# execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine +# | None = None, +# skip_duplicates: bool = False, +# ) -> cp.Packet: +# """ +# Record the output packet against the input packet in the result store. 
+# """ +# data_table = output_packet.as_table(include_context=True, include_source=True) + +# for i, (k, v) in enumerate(self.tiered_pod_id.items()): +# # add the tiered pod ID to the data table +# data_table = data_table.add_column( +# i, +# f"{constants.POD_ID_PREFIX}{k}", +# pa.array([v], type=pa.large_string()), +# ) + +# # add the input packet hash as a column +# data_table = data_table.add_column( +# 0, +# constants.INPUT_PACKET_HASH, +# pa.array([str(input_packet.content_hash())], type=pa.large_string()), +# ) +# # add execution engine information +# execution_engine_hash = execution_engine.name if execution_engine else "default" +# data_table = data_table.append_column( +# constants.EXECUTION_ENGINE, +# pa.array([execution_engine_hash], type=pa.large_string()), +# ) + +# # add computation timestamp +# timestamp = datetime.now(timezone.utc) +# data_table = data_table.append_column( +# constants.POD_TIMESTAMP, +# pa.array([timestamp], type=pa.timestamp("us", tz="UTC")), +# ) + +# if record_id is None: +# record_id = self.get_record_id( +# input_packet, execution_engine_hash=execution_engine_hash +# ) + +# self.result_database.add_record( +# self.record_path, +# record_id, +# data_table, +# skip_duplicates=skip_duplicates, +# ) +# # if result_flag is None: +# # # TODO: do more specific error handling +# # raise ValueError( +# # f"Failed to record packet {input_packet} in result store {self.result_store}" +# # ) +# # # TODO: make store return retrieved table +# return output_packet + +# def get_cached_output_for_packet(self, input_packet: cp.Packet) -> cp.Packet | None: +# """ +# Retrieve the output packet from the result store based on the input packet. +# If more than one output packet is found, conflict resolution strategy +# will be applied. +# If the output packet is not found, return None. +# """ +# # result_table = self.result_store.get_record_by_id( +# # self.record_path, +# # self.get_entry_hash(input_packet), +# # ) + +# # get all records with matching the input packet hash +# # TODO: add match based on match_tier if specified +# constraints = {constants.INPUT_PACKET_HASH: str(input_packet.content_hash())} +# if self.match_tier is not None: +# constraints[f"{constants.POD_ID_PREFIX}{self.match_tier}"] = ( +# self.pod.tiered_pod_id[self.match_tier] +# ) + +# result_table = self.result_database.get_records_with_column_value( +# self.record_path, +# constraints, +# ) +# if result_table is None or result_table.num_rows == 0: +# return None + +# if result_table.num_rows > 1: +# logger.info( +# f"Performing conflict resolution for multiple records for {input_packet.content_hash().display_name()}" +# ) +# if self.retrieval_mode == "latest": +# result_table = result_table.sort_by( +# self.DATA_RETRIEVED_FLAG, ascending=False +# ).take([0]) +# elif self.retrieval_mode == "most_specific": +# # match by the most specific pod ID +# # trying next level if not found +# for k, v in reversed(self.tiered_pod_id.items()): +# search_result = result_table.filter( +# pc.field(f"{constants.POD_ID_PREFIX}{k}") == v +# ) +# if search_result.num_rows > 0: +# result_table = search_result.take([0]) +# break +# if result_table.num_rows > 1: +# logger.warning( +# f"No matching record found for {input_packet.content_hash().display_name()} with tiered pod ID {self.tiered_pod_id}" +# ) +# result_table = result_table.sort_by( +# self.DATA_RETRIEVED_FLAG, ascending=False +# ).take([0]) + +# else: +# raise ValueError( +# f"Unknown retrieval mode: {self.retrieval_mode}. 
Supported modes are 'latest' and 'most_specific'." +# ) + +# pod_id_columns = [ +# f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() +# ] +# result_table = result_table.drop_columns(pod_id_columns) +# result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) + +# # note that data context will be loaded from the result store +# return ArrowPacket( +# result_table, +# meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, +# ) + +# def get_all_cached_outputs( +# self, include_system_columns: bool = False +# ) -> "pa.Table | None": +# """ +# Get all records from the result store for this pod. +# If include_system_columns is True, include system columns in the result. +# """ +# record_id_column = ( +# constants.PACKET_RECORD_ID if include_system_columns else None +# ) +# result_table = self.result_database.get_all_records( +# self.record_path, record_id_column=record_id_column +# ) +# if result_table is None or result_table.num_rows == 0: +# return None + +# if not include_system_columns: +# # remove input packet hash and tiered pod ID columns +# pod_id_columns = [ +# f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() +# ] +# result_table = result_table.drop_columns(pod_id_columns) +# result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) + +# return result_table diff --git a/src/orcapod/core/kernels.py b/src/orcapod/core/kernels.py deleted file mode 100644 index 52e1f8c..0000000 --- a/src/orcapod/core/kernels.py +++ /dev/null @@ -1,241 +0,0 @@ -from abc import abstractmethod -from collections.abc import Collection -from datetime import datetime, timezone -from typing import Any -from orcapod.protocols import core_protocols as cp -import logging -from orcapod.core.streams import KernelStream -from orcapod.core.base import LabeledContentIdentifiableBase -from orcapod.core.trackers import DEFAULT_TRACKER_MANAGER -from orcapod.types import PythonSchema - -logger = logging.getLogger(__name__) - - -class TrackedKernelBase(LabeledContentIdentifiableBase): - """ - Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. - It is the base class for all computations and transformations that can be performed on a collection of streams - (including an empty collection). - A kernel is defined as a callable that takes a (possibly empty) collection of streams as the input - and returns a new stream as output (note that output stream is always singular). - Each "invocation" of the kernel on a collection of streams is assigned a unique ID. - The corresponding invocation information is stored as Invocation object and attached to the output stream - for computational graph tracking. - """ - - def __init__( - self, - label: str | None = None, - skip_tracking: bool = False, - tracker_manager: cp.TrackerManager | None = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self._label = label - - self._skip_tracking = skip_tracking - self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER - self._last_modified = None - self._kernel_hash = None - self._set_modified_time() - - @property - def reference(self) -> tuple[str, ...]: - """ - Returns a unique identifier for the kernel. - This is used to identify the kernel in the computational graph. - """ - return ( - f"{self.__class__.__name__}", - self.content_hash().to_hex(), - ) - - @property - def last_modified(self) -> datetime | None: - """ - When the kernel was last modified. 
For most kernels, this is the timestamp - of the kernel creation. - """ - return self._last_modified - - # TODO: reconsider making this a public method - def _set_modified_time( - self, timestamp: datetime | None = None, invalidate: bool = False - ) -> None: - """ - Sets the last modified time of the kernel. - If `invalidate` is True, it resets the last modified time to None to indicate unstable state that'd signal downstream - to recompute when using the kernel. Othewrise, sets the last modified time to the current time or to the provided timestamp. - """ - if invalidate: - self._last_modified = None - return - - if timestamp is not None: - self._last_modified = timestamp - else: - self._last_modified = datetime.now(timezone.utc) - - @abstractmethod - def kernel_output_types( - self, *streams: cp.Stream, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - """ - Return the output types of the kernel given the input streams. - """ - ... - - def output_types( - self, *streams: cp.Stream, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - processed_streams = self.pre_kernel_processing(*streams) - self.validate_inputs(*processed_streams) - return self.kernel_output_types( - *processed_streams, include_system_tags=include_system_tags - ) - - @abstractmethod - def kernel_identity_structure( - self, streams: Collection[cp.Stream] | None = None - ) -> Any: - """ - Identity structure for this kernel. Input stream(s), if present, have already been preprocessed - and validated. - """ - ... - - def identity_structure(self, streams: Collection[cp.Stream] | None = None) -> Any: - """ - Default implementation of identity_structure for the kernel only - concerns the kernel class and the streams if present. Subclasses of - Kernels should override this method to provide a more meaningful - representation of the kernel. Note that kernel must provide the notion - of identity under possibly two distinct contexts: - 1) identity of the kernel in itself when invoked without any stream - 2) identity of the specific invocation of the kernel with a collection of streams - While the latter technically corresponds to the identity of the invocation and not - the kernel, only kernel can provide meaningful information as to the uniqueness of - the invocation as only kernel would know if / how the input stream(s) alter the identity - of the invocation. For example, if the kernel corresponds to an commutative computation - and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the - equivalence of the two by returning the same identity structure for both invocations. - This can be achieved, for example, by returning a set over the streams instead of a tuple. - """ - if streams is not None: - streams = self.pre_kernel_processing(*streams) - self.validate_inputs(*streams) - return self.kernel_identity_structure(streams) - - @abstractmethod - def forward(self, *streams: cp.Stream) -> cp.Stream: - """ - Trigger the main computation of the kernel on a collection of streams. - This method is called when the kernel is invoked with a collection of streams. - Subclasses should override this method to provide the kernel with its unique behavior - """ - - def pre_kernel_processing(self, *streams: cp.Stream) -> tuple[cp.Stream, ...]: - """ - Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing - on the input streams before the main computation. 
This is useful if you need to modify the input streams - or perform any other operations before the main computation. Critically, any Kernel/Pod invocations in the - pre-processing step will be tracked outside of the computation in the kernel. - Default implementation is a no-op, returning the input streams unchanged. - """ - return streams - - @abstractmethod - def validate_inputs(self, *streams: cp.Stream) -> None: - """ - Validate the input streams before the main computation but after the pre-kernel processing - """ - ... - - def prepare_output_stream( - self, *streams: cp.Stream, label: str | None = None - ) -> KernelStream: - """ - Prepare the output stream for the kernel invocation. - This method is called after the main computation is performed. - It creates a KernelStream with the provided streams and label. - """ - return KernelStream(source=self, upstreams=streams, label=label) - - def track_invocation(self, *streams: cp.Stream, label: str | None = None) -> None: - """ - Track the invocation of the kernel with the provided streams. - This is a convenience method that calls record_kernel_invocation. - """ - if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_kernel_invocation(self, streams, label=label) - - def __call__( - self, *streams: cp.Stream, label: str | None = None, **kwargs - ) -> KernelStream: - processed_streams = self.pre_kernel_processing(*streams) - self.validate_inputs(*processed_streams) - output_stream = self.prepare_output_stream(*processed_streams, label=label) - self.track_invocation(*processed_streams, label=label) - return output_stream - - def __repr__(self): - return self.__class__.__name__ - - def __str__(self): - if self._label is not None: - return f"{self.__class__.__name__}({self._label})" - return self.__class__.__name__ - - -class WrappedKernel(TrackedKernelBase): - """ - A wrapper for a kernels useful when you want to use an existing kernel - but need to provide some extra functionality. - - Default implementation provides a simple passthrough to the wrapped kernel. - If you want to provide a custom behavior, be sure to override the methods - that you want to change. Note that the wrapped kernel must implement the - `Kernel` protocol. Refer to `orcapod.protocols.data_protocols.Kernel` for more details. - """ - - def __init__(self, kernel: cp.Kernel, **kwargs) -> None: - # TODO: handle fixed input stream already set on the kernel - super().__init__(**kwargs) - self.kernel = kernel - - def computed_label(self) -> str | None: - """ - Compute a label for this kernel based on its content. - If label is not explicitly set for this kernel and computed_label returns a valid value, - it will be used as label of this kernel. 
- """ - return self.kernel.label - - @property - def reference(self) -> tuple[str, ...]: - return self.kernel.reference - - def kernel_output_types( - self, *streams: cp.Stream, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - return self.kernel.output_types( - *streams, include_system_tags=include_system_tags - ) - - def kernel_identity_structure( - self, streams: Collection[cp.Stream] | None = None - ) -> Any: - return self.kernel.identity_structure(streams) - - def validate_inputs(self, *streams: cp.Stream) -> None: - return self.kernel.validate_inputs(*streams) - - def forward(self, *streams: cp.Stream) -> cp.Stream: - return self.kernel.forward(*streams) - - def __repr__(self): - return f"WrappedKernel({self.kernel!r})" - - def __str__(self): - return f"WrappedKernel:{self.kernel!s}" diff --git a/src/orcapod/core/operators/__init__.py b/src/orcapod/core/operators/__init__.py index b1f0544..08ae586 100644 --- a/src/orcapod/core/operators/__init__.py +++ b/src/orcapod/core/operators/__init__.py @@ -1,14 +1,14 @@ -from .join import Join -from .semijoin import SemiJoin -from .mappers import MapTags, MapPackets from .batch import Batch from .column_selection import ( - SelectTagColumns, - SelectPacketColumns, - DropTagColumns, DropPacketColumns, + DropTagColumns, + SelectPacketColumns, + SelectTagColumns, ) from .filters import PolarsFilter +from .join import Join +from .mappers import MapPackets, MapTags +from .semijoin import SemiJoin __all__ = [ "Join", diff --git a/src/orcapod/core/operators/base.py b/src/orcapod/core/operators/base.py index b87748c..07b6ed2 100644 --- a/src/orcapod/core/operators/base.py +++ b/src/orcapod/core/operators/base.py @@ -1,85 +1,32 @@ -from orcapod.core.kernels import TrackedKernelBase -from orcapod.protocols import core_protocols as cp -from orcapod.types import PythonSchema from abc import abstractmethod -from typing import Any from collections.abc import Collection +from typing import Any + +from orcapod.core.executable_pod import ExecutablePod +from orcapod.protocols.core_protocols import ArgumentGroup, ColumnConfig, Stream +from orcapod.types import PythonSchema -class Operator(TrackedKernelBase): +class Operator(ExecutablePod): """ Base class for all operators. - Operators are a special type of kernel that can be used to perform operations on streams. + Operators are basic pods that can be used to perform operations on streams. They are defined as a callable that takes a (possibly empty) collection of streams as the input - and returns a new stream as output (note that output stream is always singular). + and returns a new stream as output. """ + def identity_structure(self) -> Any: + return self.__class__.__name__ + class UnaryOperator(Operator): """ - Base class for all operators. + Base class for all unary operators. """ - def check_unary_input( - self, - streams: Collection[cp.Stream], - ) -> None: - """ - Check that the inputs to the unary operator are valid. - """ - if len(streams) != 1: - raise ValueError("UnaryOperator requires exactly one input stream.") - - def validate_inputs(self, *streams: cp.Stream) -> None: - self.check_unary_input(streams) - stream = streams[0] - return self.op_validate_inputs(stream) - - def forward(self, *streams: cp.Stream) -> cp.Stream: - """ - Forward method for unary operators. - It expects exactly one stream as input. 
- """ - stream = streams[0] - return self.op_forward(stream) - - # TODO: complete substream implementation - # Substream implementation pending - # stream = streams[0] - # # visit each substream - # output_substreams = [] - # for substream_id in stream.substream_identities: - # substream = stream.get_substream(substream_id) - # output_substreams.append(self.op_forward(substream)) - - # # at the moment only single output substream is supported - # if len(output_substreams) != 1: - # raise NotImplementedError( - # "Support for multiple output substreams is not implemented yet." - # ) - # return output_substreams[0] - - def kernel_output_types( - self, *streams: cp.Stream, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - stream = streams[0] - return self.op_output_types(stream, include_system_tags=include_system_tags) - - def kernel_identity_structure( - self, streams: Collection[cp.Stream] | None = None - ) -> Any: - """ - Return a structure that represents the identity of this operator. - This is used to ensure that the operator can be uniquely identified in the computational graph. - """ - if streams is not None: - stream = list(streams)[0] - return self.op_identity_structure(stream) - return self.op_identity_structure() - @abstractmethod - def op_validate_inputs(self, stream: cp.Stream) -> None: + def validate_unary_input(self, stream: Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. @@ -87,16 +34,20 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: ... @abstractmethod - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: """ - This method should be implemented by subclasses to define the specific behavior of the binary operator. - It takes two streams as input and returns a new stream as output. + This method should be implemented by subclasses to define the specific behavior of the unary operator. + It takes one stream as input and returns a new stream as output. """ ... @abstractmethod - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. @@ -104,13 +55,32 @@ def op_output_types( """ ... - @abstractmethod - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def validate_inputs(self, *streams: Stream) -> None: + if len(streams) != 1: + raise ValueError("UnaryOperator requires exactly one input stream.") + stream = streams[0] + return self.validate_unary_input(stream) + + def execute(self, *streams: Stream) -> Stream: """ - This method should be implemented by subclasses to return a structure that represents the identity of the operator. - It takes two streams as input and returns a tuple containing the operator name and a set of streams. + Forward method for unary operators. + It expects exactly one stream as input. """ - ... 
+ stream = streams[0] + return self.unary_execute(stream) + + def output_schema( + self, + *streams: Stream, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> tuple[PythonSchema, PythonSchema]: + stream = streams[0] + return self.unary_output_schema(stream, columns=columns, all_info=all_info) + + def argument_symmetry(self, streams: Collection[Stream]) -> ArgumentGroup: + # return single stream as a tuple + return (tuple(streams)[0],) class BinaryOperator(Operator): @@ -118,92 +88,63 @@ class BinaryOperator(Operator): Base class for all operators. """ - def check_binary_inputs( - self, - streams: Collection[cp.Stream], - ) -> None: + @abstractmethod + def validate_binary_inputs(self, left_stream: Stream, right_stream: Stream) -> None: """ Check that the inputs to the binary operator are valid. This method is called before the forward method to ensure that the inputs are valid. """ - if len(streams) != 2: - raise ValueError("BinaryOperator requires exactly two input streams.") - - def validate_inputs(self, *streams: cp.Stream) -> None: - self.check_binary_inputs(streams) - left_stream, right_stream = streams - return self.op_validate_inputs(left_stream, right_stream) + ... - def forward(self, *streams: cp.Stream) -> cp.Stream: + @abstractmethod + def binary_execute(self, left_stream: Stream, right_stream: Stream) -> Stream: """ Forward method for binary operators. It expects exactly two streams as input. """ - left_stream, right_stream = streams - return self.op_forward(left_stream, right_stream) - - def kernel_output_types( - self, *streams: cp.Stream, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - left_stream, right_stream = streams - return self.op_output_types( - left_stream, right_stream, include_system_tags=include_system_tags - ) - - def kernel_identity_structure( - self, streams: Collection[cp.Stream] | None = None - ) -> Any: - """ - Return a structure that represents the identity of this operator. - This is used to ensure that the operator can be uniquely identified in the computational graph. - """ - if streams is not None: - left_stream, right_stream = streams - self.op_identity_structure(left_stream, right_stream) - return self.op_identity_structure() + ... @abstractmethod - def op_validate_inputs( - self, left_stream: cp.Stream, right_stream: cp.Stream - ) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ - ... + def binary_output_schema( + self, + left_stream: Stream, + right_stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> tuple[PythonSchema, PythonSchema]: ... @abstractmethod - def op_forward(self, left_stream: cp.Stream, right_stream: cp.Stream) -> cp.Stream: + def is_commutative(self) -> bool: """ - This method should be implemented by subclasses to define the specific behavior of the binary operator. - It takes two streams as input and returns a new stream as output. + Return True if the operator is commutative (i.e., order of inputs does not matter). """ ... 
- @abstractmethod - def op_output_types( + def output_schema( self, - left_stream: cp.Stream, - right_stream: cp.Stream, - include_system_tags: bool = False, + *streams: Stream, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: - """ - This method should be implemented by subclasses to return the typespecs of the input and output streams. - It takes two streams as input and returns a tuple of typespecs. - """ - ... + left_stream, right_stream = streams + return self.binary_output_schema( + left_stream, right_stream, columns=columns, all_info=all_info + ) - @abstractmethod - def op_identity_structure( - self, - left_stream: cp.Stream | None = None, - right_stream: cp.Stream | None = None, - ) -> Any: - """ - This method should be implemented by subclasses to return a structure that represents the identity of the operator. - It takes two streams as input and returns a tuple containing the operator name and a set of streams. - """ - ... + def validate_inputs(self, *streams: Stream) -> None: + if len(streams) != 2: + raise ValueError("BinaryOperator requires exactly two input streams.") + left_stream, right_stream = streams + self.validate_binary_inputs(left_stream, right_stream) + + def argument_symmetry(self, streams: Collection[Stream]) -> ArgumentGroup: + if self.is_commutative(): + # return as symmetric group + return frozenset(streams) + else: + # return as ordered group + return tuple(streams) class NonZeroInputOperator(Operator): @@ -213,78 +154,20 @@ class NonZeroInputOperator(Operator): such as joins, unions, etc. """ - def verify_non_zero_input( + @abstractmethod + def validate_nonzero_inputs( self, - streams: Collection[cp.Stream], + *streams: Stream, ) -> None: """ Check that the inputs to the variable inputs operator are valid. This method is called before the forward method to ensure that the inputs are valid. """ + ... + + def validate_inputs(self, *streams: Stream) -> None: if len(streams) == 0: raise ValueError( f"Operator {self.__class__.__name__} requires at least one input stream." ) - - def validate_inputs(self, *streams: cp.Stream) -> None: - self.verify_non_zero_input(streams) - return self.op_validate_inputs(*streams) - - def forward(self, *streams: cp.Stream) -> cp.Stream: - """ - Forward method for variable inputs operators. - It expects at least one stream as input. - """ - return self.op_forward(*streams) - - def kernel_output_types( - self, *streams: cp.Stream, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - return self.op_output_types(*streams, include_system_tags=include_system_tags) - - def kernel_identity_structure( - self, streams: Collection[cp.Stream] | None = None - ) -> Any: - """ - Return a structure that represents the identity of this operator. - This is used to ensure that the operator can be uniquely identified in the computational graph. - """ - return self.op_identity_structure(streams) - - @abstractmethod - def op_validate_inputs(self, *streams: cp.Stream) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ - ... - - @abstractmethod - def op_forward(self, *streams: cp.Stream) -> cp.Stream: - """ - This method should be implemented by subclasses to define the specific behavior of the non-zero input operator. - It takes variable number of streams as input and returns a new stream as output. - """ - ... 
- - @abstractmethod - def op_output_types( - self, *streams: cp.Stream, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - """ - This method should be implemented by subclasses to return the typespecs of the input and output streams. - It takes at least one stream as input and returns a tuple of typespecs. - """ - ... - - @abstractmethod - def op_identity_structure( - self, streams: Collection[cp.Stream] | None = None - ) -> Any: - """ - This method should be implemented by subclasses to return a structure that represents the identity of the operator. - It takes zero or more streams as input and returns a tuple containing the operator name and a set of streams. - If zero, it should return identity of the operator itself. - If one or more, it should return a identity structure approrpiate for the operator invoked on the given streams. - """ - ... + self.validate_nonzero_inputs(*streams) diff --git a/src/orcapod/core/operators/batch.py b/src/orcapod/core/operators/batch.py index be48b3c..83dc270 100644 --- a/src/orcapod/core/operators/batch.py +++ b/src/orcapod/core/operators/batch.py @@ -1,13 +1,13 @@ +from typing import TYPE_CHECKING, Any + from orcapod.core.operators.base import UnaryOperator -from collections.abc import Collection -from orcapod.protocols import core_protocols as cp -from typing import Any, TYPE_CHECKING -from orcapod.utils.lazy_module import LazyModule from orcapod.core.streams import TableStream +from orcapod.protocols.core_protocols import ColumnConfig, Stream +from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: - import pyarrow as pa import polars as pl + import pyarrow as pa else: pa = LazyModule("pyarrow") pl = LazyModule("polars") @@ -29,34 +29,18 @@ def __init__(self, batch_size: int = 0, drop_partial_batch: bool = False, **kwar self.batch_size = batch_size self.drop_partial_batch = drop_partial_batch - def check_unary_input( - self, - streams: Collection[cp.Stream], - ) -> None: + def validate_unary_input(self, stream: Stream) -> None: """ - Check that the inputs to the unary operator are valid. - """ - if len(streams) != 1: - raise ValueError("UnaryOperator requires exactly one input stream.") - - def validate_inputs(self, *streams: cp.Stream) -> None: - self.check_unary_input(streams) - stream = streams[0] - return self.op_validate_inputs(stream) - - def op_validate_inputs(self, stream: cp.Stream) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. + Batch works on any input stream, so no validation is needed. """ return None - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: """ This method should be implemented by subclasses to define the specific behavior of the binary operator. It takes two streams as input and returns a new stream as output. 
""" - table = stream.as_table(include_source=True, include_system_tags=True) + table = stream.as_table(columns={"source": True, "system_tags": True}) tag_columns, packet_columns = stream.keys() @@ -83,24 +67,25 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: batched_table = pa.Table.from_pylist(batched_data) return TableStream(batched_table, tag_columns=tag_columns) - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. It takes two streams as input and returns a tuple of typespecs. """ - tag_types, packet_types = stream.types(include_system_tags=include_system_tags) + tag_types, packet_types = stream.output_schema( + columns=columns, all_info=all_info + ) batched_tag_types = {k: list[v] for k, v in tag_types.items()} batched_packet_types = {k: list[v] for k, v in packet_types.items()} # TODO: check if this is really necessary return PythonSchema(batched_tag_types), PythonSchema(batched_packet_types) - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: - return ( - (self.__class__.__name__, self.batch_size, self.drop_partial_batch) - + (stream,) - if stream is not None - else () - ) + def identity_structure(self) -> Any: + return (self.__class__.__name__, self.batch_size, self.drop_partial_batch) diff --git a/src/orcapod/core/operators/column_selection.py b/src/orcapod/core/operators/column_selection.py index 4140db8..f37b8a4 100644 --- a/src/orcapod/core/operators/column_selection.py +++ b/src/orcapod/core/operators/column_selection.py @@ -1,14 +1,14 @@ -from orcapod.protocols import core_protocols as cp +import logging +from collections.abc import Collection, Mapping +from typing import TYPE_CHECKING, Any + +from orcapod.core.operators.base import UnaryOperator from orcapod.core.streams import TableStream +from orcapod.core.system_constants import constants +from orcapod.errors import InputValidationError +from orcapod.protocols.core_protocols import ColumnConfig, Stream from orcapod.types import PythonSchema -from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule -from collections.abc import Collection, Mapping -from orcapod.errors import InputValidationError -from orcapod.core.system_constants import constants -from orcapod.core.operators.base import UnaryOperator -import logging - if TYPE_CHECKING: import pyarrow as pa @@ -30,7 +30,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() tags_to_drop = [c for c in tag_columns if c not in self.columns] new_tag_columns = [c for c in tag_columns if c not in tags_to_drop] @@ -40,7 +40,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: return stream table = stream.as_table( - include_source=True, include_system_tags=True, sort_by_tags=False + columns={"source": True, "system_tags": True, "sort_by_tags": False} ) modified_table = table.drop_columns(list(tags_to_drop)) @@ -52,7 +52,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: upstreams=(stream,), ) - def op_validate_inputs(self, stream: cp.Stream) -> None: + def 
validate_unary_input(self, stream: Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. @@ -66,11 +66,15 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: f"Missing tag columns: {missing_columns}. Make sure all specified columns to select are present or use strict=False to ignore missing columns" ) - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: - tag_schema, packet_schema = stream.types( - include_system_tags=include_system_tags + tag_schema, packet_schema = stream.output_schema( + columns=columns, all_info=all_info ) tag_columns, _ = stream.keys() tags_to_drop = [tc for tc in tag_columns if tc not in self.columns] @@ -80,7 +84,7 @@ def op_output_types( return new_tag_schema, packet_schema - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def op_identity_structure(self, stream: Stream | None = None) -> Any: return ( self.__class__.__name__, self.columns, @@ -100,7 +104,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() packet_columns_to_drop = [c for c in packet_columns if c not in self.columns] new_packet_columns = [ @@ -112,7 +116,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: return stream table = stream.as_table( - include_source=True, include_system_tags=True, sort_by_tags=False + columns={"source": True, "system_tags": True, "sort_by_tags": False}, ) # make sure to drop associated source fields associated_source_fields = [ @@ -129,7 +133,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: upstreams=(stream,), ) - def op_validate_inputs(self, stream: cp.Stream) -> None: + def validate_unary_input(self, stream: Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. @@ -143,11 +147,15 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: f"Missing packet columns: {missing_columns}. 
Make sure all specified columns to select are present or use strict=False to ignore missing columns" ) - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: - tag_schema, packet_schema = stream.types( - include_system_tags=include_system_tags + tag_schema, packet_schema = stream.output_schema( + columns=columns, all_info=all_info ) _, packet_columns = stream.keys() packets_to_drop = [pc for pc in packet_columns if pc not in self.columns] @@ -159,12 +167,12 @@ def op_output_types( return tag_schema, new_packet_schema - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def identity_structure(self) -> Any: return ( self.__class__.__name__, self.columns, self.strict, - ) + ((stream,) if stream is not None else ()) + ) class DropTagColumns(UnaryOperator): @@ -179,7 +187,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() columns_to_drop = self.columns if not self.strict: @@ -192,7 +200,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: return stream table = stream.as_table( - include_source=True, include_system_tags=True, sort_by_tags=False + columns={"source": True, "system_tags": True, "sort_by_tags": False} ) modified_table = table.drop_columns(list(columns_to_drop)) @@ -204,7 +212,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: upstreams=(stream,), ) - def op_validate_inputs(self, stream: cp.Stream) -> None: + def validate_unary_input(self, stream: Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. @@ -218,11 +226,15 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: f"Missing tag columns: {missing_columns}. 
Make sure all specified columns to drop are present or use strict=False to ignore missing columns" ) - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: - tag_schema, packet_schema = stream.types( - include_system_tags=include_system_tags + tag_schema, packet_schema = stream.output_schema( + columns=columns, all_info=all_info ) tag_columns, _ = stream.keys() new_tag_columns = [c for c in tag_columns if c not in self.columns] @@ -231,12 +243,12 @@ def op_output_types( return new_tag_schema, packet_schema - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def identity_structure(self) -> Any: return ( self.__class__.__name__, self.columns, self.strict, - ) + ((stream,) if stream is not None else ()) + ) class DropPacketColumns(UnaryOperator): @@ -251,7 +263,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() columns_to_drop = list(self.columns) if not self.strict: @@ -268,7 +280,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: columns_to_drop.extend(associated_source_columns) table = stream.as_table( - include_source=True, include_system_tags=True, sort_by_tags=False + columns={"source": True, "system_tags": True, "sort_by_tags": False} ) modified_table = table.drop_columns(columns_to_drop) @@ -280,7 +292,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: upstreams=(stream,), ) - def op_validate_inputs(self, stream: cp.Stream) -> None: + def validate_unary_input(self, stream: Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. @@ -293,24 +305,29 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: f"Missing packet columns: {missing_columns}. 
Make sure all specified columns to drop are present or use strict=False to ignore missing columns" ) - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: - tag_schema, packet_schema = stream.types( - include_system_tags=include_system_tags + tag_schema, packet_schema = stream.output_schema( + columns=columns, all_info=all_info ) + new_packet_schema = { k: v for k, v in packet_schema.items() if k not in self.columns } return tag_schema, new_packet_schema - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def identity_structure(self) -> Any: return ( self.__class__.__name__, self.columns, self.strict, - ) + ((stream,) if stream is not None else ()) + ) class MapTags(UnaryOperator): @@ -327,7 +344,7 @@ def __init__( self.drop_unmapped = drop_unmapped super().__init__(**kwargs) - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() missing_tags = set(tag_columns) - set(self.name_map.keys()) @@ -335,7 +352,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: # nothing to rename in the tags, return stream as is return stream - table = stream.as_table(include_source=True, include_system_tags=True) + table = stream.as_table(columns={"source": True, "system_tags": True}) name_map = { tc: self.name_map.get(tc, tc) for tc in tag_columns @@ -354,7 +371,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: renamed_table, tag_columns=new_tag_columns, source=self, upstreams=(stream,) ) - def op_validate_inputs(self, stream: cp.Stream) -> None: + def validate_unary_input(self, stream: Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. @@ -379,11 +396,15 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: message += f"overlapping packet columns: {overlapping_packet_columns}." 
raise InputValidationError(message) - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, packet_typespec = stream.types( - include_system_tags=include_system_tags + tag_typespec, packet_typespec = stream.output_schema( + columns=columns, all_info=all_info ) # Create new packet typespec with renamed keys @@ -391,9 +412,9 @@ def op_output_types( return new_tag_typespec, packet_typespec - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def identity_structure(self) -> Any: return ( self.__class__.__name__, self.name_map, self.drop_unmapped, - ) + ((stream,) if stream is not None else ()) + ) diff --git a/src/orcapod/core/operators/filters.py b/src/orcapod/core/operators/filters.py index 2edf4f7..4a69032 100644 --- a/src/orcapod/core/operators/filters.py +++ b/src/orcapod/core/operators/filters.py @@ -1,21 +1,20 @@ -from orcapod.protocols import core_protocols as cp +import logging +from collections.abc import Collection, Iterable, Mapping +from typing import TYPE_CHECKING, Any, TypeAlias + +from orcapod.core.operators.base import UnaryOperator from orcapod.core.streams import TableStream +from orcapod.core.system_constants import constants +from orcapod.errors import InputValidationError +from orcapod.protocols.core_protocols import ColumnConfig, Stream from orcapod.types import PythonSchema -from typing import Any, TYPE_CHECKING, TypeAlias from orcapod.utils.lazy_module import LazyModule -from collections.abc import Collection, Mapping -from orcapod.errors import InputValidationError -from orcapod.core.system_constants import constants -from orcapod.core.operators.base import UnaryOperator -import logging -from collections.abc import Iterable - if TYPE_CHECKING: - import pyarrow as pa + import numpy as np import polars as pl import polars._typing as pl_type - import numpy as np + import pyarrow as pa else: pa = LazyModule("pyarrow") pl = LazyModule("polars") @@ -43,7 +42,7 @@ def __init__( self.constraints = constraints if constraints is not None else {} super().__init__(**kwargs) - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: if len(self.predicates) == 0 and len(self.constraints) == 0: logger.info( "No predicates or constraints specified. Returning stream unaltered." @@ -52,39 +51,43 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: # TODO: improve efficiency here... table = stream.as_table( - include_source=True, include_system_tags=True, sort_by_tags=False + columns={"source": True, "system_tags": True, "sort_by_tags": False} ) df = pl.DataFrame(table) filtered_table = df.filter(*self.predicates, **self.constraints).to_arrow() return TableStream( filtered_table, - tag_columns=stream.tag_keys(), + tag_columns=stream.keys()[0], source=self, upstreams=(stream,), ) - def op_validate_inputs(self, stream: cp.Stream) -> None: + def validate_unary_input(self, stream: Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. 
""" - # Any valid stream would work return - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + include_system_tags: bool = False, ) -> tuple[PythonSchema, PythonSchema]: # data types are not modified - return stream.types(include_system_tags=include_system_tags) + return stream.output_schema(columns=columns, all_info=all_info) - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def identity_structure(self) -> Any: return ( self.__class__.__name__, self.predicates, self.constraints, - ) + ((stream,) if stream is not None else ()) + ) class SelectPacketColumns(UnaryOperator): @@ -99,7 +102,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() packet_columns_to_drop = [c for c in packet_columns if c not in self.columns] new_packet_columns = [ @@ -111,7 +114,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: return stream table = stream.as_table( - include_source=True, include_system_tags=True, sort_by_tags=False + columns={"source": True, "system_tags": True, "sort_by_tags": False} ) # make sure to drop associated source fields associated_source_fields = [ @@ -128,13 +131,13 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: upstreams=(stream,), ) - def op_validate_inputs(self, stream: cp.Stream) -> None: + def validate_unary_input(self, stream: Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. """ # TODO: remove redundant logic - tag_columns, packet_columns = stream.keys() + _, packet_columns = stream.keys() columns_to_select = self.columns missing_columns = set(columns_to_select) - set(packet_columns) if missing_columns and self.strict: @@ -142,11 +145,16 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: f"Missing packet columns: {missing_columns}. 
Make sure all specified columns to select are present or use strict=False to ignore missing columns" ) - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + include_system_tags: bool = False, ) -> tuple[PythonSchema, PythonSchema]: - tag_schema, packet_schema = stream.types( - include_system_tags=include_system_tags + tag_schema, packet_schema = stream.output_schema( + columns=columns, all_info=all_info ) _, packet_columns = stream.keys() packets_to_drop = [pc for pc in packet_columns if pc not in self.columns] @@ -158,9 +166,9 @@ def op_output_types( return tag_schema, new_packet_schema - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def identity_structure(self) -> Any: return ( self.__class__.__name__, self.columns, self.strict, - ) + ((stream,) if stream is not None else ()) + ) diff --git a/src/orcapod/core/operators/join.py b/src/orcapod/core/operators/join.py index 04c65ee..55901ff 100644 --- a/src/orcapod/core/operators/join.py +++ b/src/orcapod/core/operators/join.py @@ -1,17 +1,18 @@ -from orcapod.protocols import core_protocols as cp +from collections.abc import Collection +from typing import TYPE_CHECKING, Any + +from orcapod.core import arrow_data_utils +from orcapod.core.operators.base import NonZeroInputOperator from orcapod.core.streams import TableStream +from orcapod.errors import InputValidationError +from orcapod.protocols.core_protocols import ArgumentGroup, ColumnConfig, Stream from orcapod.types import PythonSchema -from orcapod.utils import types_utils -from typing import Any, TYPE_CHECKING +from orcapod.utils import schema_utils from orcapod.utils.lazy_module import LazyModule -from collections.abc import Collection -from orcapod.errors import InputValidationError -from orcapod.core.operators.base import NonZeroInputOperator -from orcapod.core import arrow_data_utils if TYPE_CHECKING: - import pyarrow as pa import polars as pl + import pyarrow as pa else: pa = LazyModule("pyarrow") pl = LazyModule("polars") @@ -26,40 +27,48 @@ def kernel_id(self) -> tuple[str, ...]: """ return (f"{self.__class__.__name__}",) - def op_validate_inputs(self, *streams: cp.Stream) -> None: + def validate_nonzero_inputs(self, *streams: Stream) -> None: try: - self.op_output_types(*streams) + self.output_schema(*streams) except Exception as e: # raise InputValidationError(f"Input streams are not compatible: {e}") from e raise e - def order_input_streams(self, *streams: cp.Stream) -> list[cp.Stream]: + def order_input_streams(self, *streams: Stream) -> list[Stream]: # order the streams based on their hashes to offer deterministic operation return sorted(streams, key=lambda s: s.content_hash().to_hex()) - def op_output_types( - self, *streams: cp.Stream, include_system_tags: bool = False + def argument_symmetry(self, streams: Collection) -> ArgumentGroup: + return frozenset(streams) + + def output_schema( + self, + *streams: Stream, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: if len(streams) == 1: # If only one stream is provided, return its typespecs - return streams[0].types(include_system_tags=include_system_tags) + return streams[0].output_schema(columns=columns, all_info=all_info) # output type computation does NOT require consistent ordering of streams # TODO: consider performing the check always with system 
tags on stream = streams[0] - tag_typespec, packet_typespec = stream.types( - include_system_tags=include_system_tags + tag_typespec, packet_typespec = stream.output_schema( + columns=columns, all_info=all_info ) for other_stream in streams[1:]: - other_tag_typespec, other_packet_typespec = other_stream.types( - include_system_tags=include_system_tags + other_tag_typespec, other_packet_typespec = other_stream.output_schema( + columns=columns, all_info=all_info + ) + tag_typespec = schema_utils.union_typespecs( + tag_typespec, other_tag_typespec ) - tag_typespec = types_utils.union_typespecs(tag_typespec, other_tag_typespec) - intersection_packet_typespec = types_utils.intersection_typespecs( + intersection_packet_typespec = schema_utils.intersection_typespecs( packet_typespec, other_packet_typespec ) - packet_typespec = types_utils.union_typespecs( + packet_typespec = schema_utils.union_typespecs( packet_typespec, other_packet_typespec ) if intersection_packet_typespec: @@ -69,7 +78,7 @@ def op_output_types( return tag_typespec, packet_typespec - def op_forward(self, *streams: cp.Stream) -> cp.Stream: + def execute(self, *streams: Stream) -> Stream: """ Joins two streams together based on their tags. The resulting stream will contain all the tags from both streams. @@ -82,7 +91,7 @@ def op_forward(self, *streams: cp.Stream) -> cp.Stream: stream = streams[0] tag_keys, _ = [set(k) for k in stream.keys()] - table = stream.as_table(include_source=True, include_system_tags=True) + table = stream.as_table(columns={"source": True, "system_tags": True}) # trick to get cartesian product table = table.add_column(0, COMMON_JOIN_KEY, pa.array([0] * len(table))) table = arrow_data_utils.append_to_system_tags( @@ -93,7 +102,7 @@ def op_forward(self, *streams: cp.Stream) -> cp.Stream: for next_stream in streams[1:]: next_tag_keys, _ = next_stream.keys() next_table = next_stream.as_table( - include_source=True, include_system_tags=True + columns={"source": True, "system_tags": True} ) next_table = arrow_data_utils.append_to_system_tags( next_table, @@ -130,12 +139,8 @@ def op_forward(self, *streams: cp.Stream) -> cp.Stream: upstreams=streams, ) - def op_identity_structure( - self, streams: Collection[cp.Stream] | None = None - ) -> Any: - return ( - (self.__class__.__name__,) + (set(streams),) if streams is not None else () - ) + def identity_structure(self) -> Any: + return self.__class__.__name__ def __repr__(self) -> str: return "Join()" diff --git a/src/orcapod/core/operators/mappers.py b/src/orcapod/core/operators/mappers.py index 5500e1b..51fd7fc 100644 --- a/src/orcapod/core/operators/mappers.py +++ b/src/orcapod/core/operators/mappers.py @@ -1,12 +1,13 @@ -from orcapod.protocols import core_protocols as cp +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any + +from orcapod.core.operators.base import UnaryOperator from orcapod.core.streams import TableStream +from orcapod.core.system_constants import constants +from orcapod.errors import InputValidationError +from orcapod.protocols.core_protocols import ColumnConfig, Stream from orcapod.types import PythonSchema -from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule -from collections.abc import Mapping -from orcapod.errors import InputValidationError -from orcapod.core.system_constants import constants -from orcapod.core.operators.base import UnaryOperator if TYPE_CHECKING: import pyarrow as pa @@ -28,7 +29,7 @@ def __init__( self.drop_unmapped = drop_unmapped super().__init__(**kwargs) - def 
op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() unmapped_columns = set(packet_columns) - set(self.name_map.keys()) @@ -37,7 +38,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: return stream table = stream.as_table( - include_source=True, include_system_tags=True, sort_by_tags=False + columns={"source": True, "system_tags": True, "sort_by_tags": False} ) name_map = { @@ -68,11 +69,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: renamed_table, tag_columns=tag_columns, source=self, upstreams=(stream,) ) - def op_validate_inputs(self, stream: cp.Stream) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ + def validate_unary_input(self, stream: Stream) -> None: # verify that renamed value does NOT collide with other columns tag_columns, packet_columns = stream.keys() relevant_source = [] @@ -95,11 +92,15 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: message += f"overlapping tag columns: {overlapping_tag_columns}." raise InputValidationError(message) - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, packet_typespec = stream.types( - include_system_tags=include_system_tags + tag_typespec, packet_typespec = stream.output_schema( + columns=columns, all_info=all_info ) # Create new packet typespec with renamed keys @@ -111,12 +112,12 @@ def op_output_types( return tag_typespec, new_packet_typespec - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def identity_structure(self) -> Any: return ( self.__class__.__name__, self.name_map, self.drop_unmapped, - ) + ((stream,) if stream is not None else ()) + ) class MapTags(UnaryOperator): @@ -133,7 +134,7 @@ def __init__( self.drop_unmapped = drop_unmapped super().__init__(**kwargs) - def op_forward(self, stream: cp.Stream) -> cp.Stream: + def unary_execute(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() missing_tags = set(tag_columns) - set(self.name_map.keys()) @@ -141,7 +142,9 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: # nothing to rename in the tags, return stream as is return stream - table = stream.as_table(include_source=True, include_system_tags=True) + table = stream.as_table( + columns={"source": True, "system_tags": True, "sort_by_tags": False} + ) name_map = { tc: self.name_map.get(tc, tc) @@ -162,7 +165,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: renamed_table, tag_columns=new_tag_columns, source=self, upstreams=(stream,) ) - def op_validate_inputs(self, stream: cp.Stream) -> None: + def validate_unary_input(self, stream: Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. @@ -187,11 +190,16 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: message += f"overlapping packet columns: {overlapping_packet_columns}." 
raise InputValidationError(message) - def op_output_types( - self, stream: cp.Stream, include_system_tags: bool = False + def unary_output_schema( + self, + stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + include_system_tags: bool = False, ) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, packet_typespec = stream.types( - include_system_tags=include_system_tags + tag_typespec, packet_typespec = stream.output_schema( + columns=columns, all_info=all_info ) # Create new packet typespec with renamed keys @@ -208,9 +216,9 @@ def op_output_types( return new_tag_typespec, packet_typespec - def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + def identity_structure(self) -> Any: return ( self.__class__.__name__, self.name_map, self.drop_unmapped, - ) + ((stream,) if stream is not None else ()) + ) diff --git a/src/orcapod/core/operators/semijoin.py b/src/orcapod/core/operators/semijoin.py index 6cdff4c..5049409 100644 --- a/src/orcapod/core/operators/semijoin.py +++ b/src/orcapod/core/operators/semijoin.py @@ -1,11 +1,12 @@ -from orcapod.protocols import core_protocols as cp +from typing import TYPE_CHECKING, Any + +from orcapod.core.operators.base import BinaryOperator from orcapod.core.streams import TableStream -from orcapod.utils import types_utils +from orcapod.errors import InputValidationError +from orcapod.protocols.core_protocols import ColumnConfig, Stream from orcapod.types import PythonSchema -from typing import Any, TYPE_CHECKING +from orcapod.utils import schema_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.errors import InputValidationError -from orcapod.core.operators.base import BinaryOperator if TYPE_CHECKING: import pyarrow as pa @@ -27,47 +28,24 @@ class SemiJoin(BinaryOperator): The output stream preserves the schema of the left stream exactly. """ - @property - def kernel_id(self) -> tuple[str, ...]: - """ - Returns a unique identifier for the kernel. - This is used to identify the kernel in the computational graph. - """ - return (f"{self.__class__.__name__}",) - - def op_identity_structure( - self, - left_stream: cp.Stream | None = None, - right_stream: cp.Stream | None = None, - ) -> Any: - """ - Return a structure that represents the identity of this operator. - Unlike Join, SemiJoin depends on the order of streams (left vs right). - """ - id_struct = (self.__class__.__name__,) - if left_stream is not None and right_stream is not None: - # Order matters for semi-join: (left_stream, right_stream) - id_struct += (left_stream, right_stream) - return id_struct - - def op_forward(self, left_stream: cp.Stream, right_stream: cp.Stream) -> cp.Stream: + def binary_execute(self, left_stream: Stream, right_stream: Stream) -> Stream: """ Performs a semi-join between left and right streams. Returns entries from left stream that have matching entries in right stream. 
""" - left_tag_typespec, left_packet_typespec = left_stream.types() - right_tag_typespec, right_packet_typespec = right_stream.types() + left_tag_schema, left_packet_schema = left_stream.output_schema() + right_tag_schema, right_packet_schema = right_stream.output_schema() # Find overlapping columns across all columns (tags + packets) - left_all_typespec = types_utils.union_typespecs( - left_tag_typespec, left_packet_typespec + left_all_typespec = schema_utils.union_typespecs( + left_tag_schema, left_packet_schema ) - right_all_typespec = types_utils.union_typespecs( - right_tag_typespec, right_packet_typespec + right_all_typespec = schema_utils.union_typespecs( + right_tag_schema, right_packet_schema ) common_keys = tuple( - types_utils.intersection_typespecs( + schema_utils.intersection_typespecs( left_all_typespec, right_all_typespec ).keys() ) @@ -77,7 +55,7 @@ def op_forward(self, left_stream: cp.Stream, right_stream: cp.Stream) -> cp.Stre return left_stream # include source info for left stream - left_table = left_stream.as_table(include_source=True) + left_table = left_stream.as_table(columns={"source": True}) # Get the right table for matching right_table = right_stream.as_table() @@ -91,50 +69,50 @@ def op_forward(self, left_stream: cp.Stream, right_stream: cp.Stream) -> cp.Stre return TableStream( semi_joined_table, - tag_columns=tuple(left_tag_typespec.keys()), + tag_columns=tuple(left_tag_schema.keys()), source=self, upstreams=(left_stream, right_stream), ) - def op_output_types( + def binary_output_schema( self, - left_stream: cp.Stream, - right_stream: cp.Stream, - include_system_tags: bool = False, + left_stream: Stream, + right_stream: Stream, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: """ Returns the output types for the semi-join operation. The output preserves the exact schema of the left stream. """ # Semi-join preserves the left stream's schema exactly - return left_stream.types(include_system_tags=include_system_tags) + return left_stream.output_schema(columns=columns, all_info=all_info) - def op_validate_inputs( - self, left_stream: cp.Stream, right_stream: cp.Stream - ) -> None: + def validate_binary_inputs(self, left_stream: Stream, right_stream: Stream) -> None: """ Validates that the input streams are compatible for semi-join. Checks that overlapping columns have compatible types. 
""" try: - left_tag_typespec, left_packet_typespec = left_stream.types() - right_tag_typespec, right_packet_typespec = right_stream.types() + left_tag_typespec, left_packet_typespec = left_stream.output_schema() + right_tag_typespec, right_packet_typespec = right_stream.output_schema() # Check that overlapping columns have compatible types across all columns - left_all_typespec = types_utils.union_typespecs( + left_all_typespec = schema_utils.union_typespecs( left_tag_typespec, left_packet_typespec ) - right_all_typespec = types_utils.union_typespecs( + right_all_typespec = schema_utils.union_typespecs( right_tag_typespec, right_packet_typespec ) # intersection_typespecs will raise an error if types are incompatible - types_utils.intersection_typespecs(left_all_typespec, right_all_typespec) + schema_utils.intersection_typespecs(left_all_typespec, right_all_typespec) except Exception as e: raise InputValidationError( f"Input streams are not compatible for semi-join: {e}" ) from e - def __repr__(self) -> str: - return "SemiJoin()" + def identity_structure(self) -> Any: + return self.__class__.__name__ diff --git a/src/orcapod/core/packet_function.py b/src/orcapod/core/packet_function.py new file mode 100644 index 0000000..ba02085 --- /dev/null +++ b/src/orcapod/core/packet_function.py @@ -0,0 +1,307 @@ +import hashlib +import logging +import re +import sys +from abc import abstractmethod +from collections.abc import Callable, Collection, Iterable, Sequence +from typing import TYPE_CHECKING, Any, Literal + +from orcapod.core.base import OrcapodBase +from orcapod.core.datagrams import DictPacket +from orcapod.hashing.hash_utils import get_function_components, get_function_signature +from orcapod.protocols.core_protocols import Packet +from orcapod.types import DataValue, PythonSchema, PythonSchemaLike +from orcapod.utils import schema_utils +from orcapod.utils.git_utils import get_git_info_for_python_object +from orcapod.utils.lazy_module import LazyModule + + +def process_function_output(self, values: Any) -> dict[str, DataValue]: + output_values = [] + if len(self.output_keys) == 0: + output_values = [] + elif len(self.output_keys) == 1: + output_values = [values] # type: ignore + elif isinstance(values, Iterable): + output_values = list(values) # type: ignore + elif len(self.output_keys) > 1: + raise ValueError( + "Values returned by function must be a pathlike or a sequence of pathlikes" + ) + + if len(output_values) != len(self.output_keys): + raise ValueError( + f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" + ) + + return {k: v for k, v in zip(self.output_keys, output_values)} + + +# TODO: extract default char count as config +def combine_hashes( + *hashes: str, + order: bool = False, + prefix_hasher_id: bool = False, + hex_char_count: int | None = 20, +) -> str: + """Combine hashes into a single hash string.""" + + # Sort for deterministic order regardless of input order + if order: + prepared_hashes = sorted(hashes) + else: + prepared_hashes = list(hashes) + combined = "".join(prepared_hashes) + combined_hash = hashlib.sha256(combined.encode()).hexdigest() + if hex_char_count is not None: + combined_hash = combined_hash[:hex_char_count] + if prefix_hasher_id: + return "sha256@" + combined_hash + return combined_hash + + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc +else: + pa = LazyModule("pyarrow") + pc = LazyModule("pyarrow.compute") + +logger = 
logging.getLogger(__name__) + +error_handling_options = Literal["raise", "ignore", "warn"] + + +class PacketFunctionBase(OrcapodBase): + """ + Abstract base class for PacketFunction, defining the interface and common functionality. + """ + + def __init__(self, version: str = "v0.0", **kwargs): + super().__init__(**kwargs) + self._active = True + self._version = version + + match = re.match(r"\D.*(\d+)", version) + if match: + self._major_version = int(match.group(1)) + self._minor_version = version[match.end(1) :] + else: + raise ValueError( + f"Version string {version} does not contain a valid version number" + ) + + def identity_structure(self) -> Any: + return self.get_function_variation_data() + + @property + def major_version(self) -> int: + return self._major_version + + @property + def minor_version_string(self) -> str: + return self._minor_version + + @property + @abstractmethod + def packet_function_type_id(self) -> str: + """ + Unique function type identifier + """ + ... + + @property + @abstractmethod + def canonical_function_name(self) -> str: + """ + Human-readable function identifier + """ + ... + + @property + @abstractmethod + def input_packet_schema(self) -> PythonSchema: + """ + Return the input typespec for the pod. This is used to validate the input streams. + """ + ... + + @property + @abstractmethod + def output_packet_schema(self) -> PythonSchema: + """ + Return the output typespec for the pod. This is used to validate the output streams. + """ + ... + + @abstractmethod + def get_function_variation_data(self) -> dict[str, Any]: + """Raw data defining function variation - system computes hash""" + ... + + @abstractmethod + def get_execution_data(self) -> dict[str, Any]: + """Raw data defining execution context - system computes hash""" + ... + + @abstractmethod + def call(self, packet: Packet) -> Packet | None: + """ + Process the input packet and return the output packet. + """ + ... + + @abstractmethod + async def async_call(self, packet: Packet) -> Packet | None: + """ + Asynchronously process the input packet and return the output packet. + """ + ... 
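# --- Editor's note: illustrative usage sketch, not part of this patch ---
# The abstract PacketFunctionBase interface above is easiest to read next to a
# concrete use of the PythonPacketFunction implementation that follows. This is
# a minimal sketch under assumptions: DictPacket (imported at the top of this
# module) can be constructed from a plain dict of values, and the wrapped
# function `add`, its argument values, and the "total" output key are
# hypothetical examples, not names taken from the patch.

from orcapod.core.datagrams import DictPacket
from orcapod.core.packet_function import PythonPacketFunction


def add(x: int, y: int) -> int:
    return x + y


# Wrap the plain Python function; output_keys names the value(s) it returns,
# and the packet schemas are extracted from the function signature.
add_pf = PythonPacketFunction(add, output_keys="total")

# call() consumes a Packet and returns a new Packet (or None when the
# function has been deactivated via set_active(False)).
result = add_pf.call(DictPacket({"x": 1, "y": 2}))
# result.as_dict() is expected to yield {"total": 3}
# --- end editor's note ---------------------------------------------------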
+ + +class PythonPacketFunction(PacketFunctionBase): + @property + def packet_function_type_id(self) -> str: + """ + Unique function type identifier + """ + return "python.function.v0" + + @property + def canonical_function_name(self) -> str: + """ + Human-readable function identifier + """ + return self._function_name + + def __init__( + self, + function: Callable[..., Any], + output_keys: str | Collection[str] | None = None, + function_name: str | None = None, + version: str = "v0.0", + input_schema: PythonSchemaLike | None = None, + output_schema: PythonSchemaLike | Sequence[type] | None = None, + label: str | None = None, + **kwargs, + ) -> None: + self._function = function + + if output_keys is None: + output_keys = [] + if isinstance(output_keys, str): + output_keys = [output_keys] + self._output_keys = output_keys + if function_name is None: + if hasattr(self._function, "__name__"): + function_name = getattr(self._function, "__name__") + else: + raise ValueError( + "function_name must be provided if function has no __name__" + ) + + assert function_name is not None + self._function_name = function_name + + super().__init__(label=label or self._function_name, version=version, **kwargs) + + # extract input and output schema from the function signature + input_schema, output_schema = schema_utils.extract_function_typespecs( + self._function, + self._output_keys, + input_typespec=input_schema, + output_typespec=output_schema, + ) + + # get git info for the function + # TODO: turn this into optional addition + env_info = get_git_info_for_python_object(self._function) + if env_info is None: + git_hash = "unknown" + else: + git_hash = env_info.get("git_commit_hash", "unknown") + if env_info.get("git_repo_status") == "dirty": + git_hash += "-dirty" + self._git_hash = git_hash + + self._input_schema = input_schema + self._output_schema = output_schema + + object_hasher = self.data_context.object_hasher + self._function_signature_hash = object_hasher.hash_object( + get_function_signature(function) + ).to_string() + self._function_content_hash = object_hasher.hash_object( + get_function_components(self._function) + ).to_string() + self._output_schema_hash = object_hasher.hash_object( + self.output_packet_schema + ).to_string() + + def get_function_variation_data(self) -> dict[str, Any]: + """Raw data defining function variation - system computes hash""" + return { + "function_name": self._function_name, + "function_signature_hash": self._function_signature_hash, + "function_content_hash": self._function_content_hash, + "git_hash": self._git_hash, + } + + def get_execution_data(self) -> dict[str, Any]: + """Raw data defining execution context - system computes hash""" + python_version_info = sys.version_info + python_version_str = f"{python_version_info.major}.{python_version_info.minor}.{python_version_info.micro}" + return {"python_version": python_version_str, "execution_context": "local"} + + @property + def input_packet_schema(self) -> PythonSchema: + """ + Return the input typespec for the pod. This is used to validate the input streams. + """ + return self._input_schema + + @property + def output_packet_schema(self) -> PythonSchema: + """ + Return the output typespec for the pod. This is used to validate the output streams. + """ + return self._output_schema + + def is_active(self) -> bool: + """ + Check if the pod is active. If not, it will not process any packets. + """ + return self._active + + def set_active(self, active: bool = True) -> None: + """ + Set the active state of the pod. 
If set to False, the pod will not process any packets. + """ + self._active = active + + def call(self, packet: Packet) -> Packet | None: + if not self._active: + return None + values = self._function(**packet.as_dict()) + output_values = [] + + if len(self._output_keys) == 0: + output_values = [] + elif len(self._output_keys) == 1: + output_values = [values] # type: ignore + elif isinstance(values, Iterable): + output_values = list(values) # type: ignore + elif len(self._output_keys) > 1: + raise ValueError( + "Values returned by function must be sequence-like if multiple output keys are specified" + ) + + if len(output_values) != len(self._output_keys): + raise ValueError( + f"Number of output keys {len(self._output_keys)}:{self._output_keys} does not match number of values returned by function {len(output_values)}" + ) + + return DictPacket({k: v for k, v in zip(self._output_keys, output_values)}) + + async def async_call(self, packet: Packet) -> Packet | None: + raise NotImplementedError("Async call not implemented for synchronous function") diff --git a/src/orcapod/core/pods.py b/src/orcapod/core/pods.py deleted file mode 100644 index 02d3aa4..0000000 --- a/src/orcapod/core/pods.py +++ /dev/null @@ -1,905 +0,0 @@ -import hashlib -import logging -from abc import abstractmethod -from collections.abc import Callable, Collection, Iterable, Sequence -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any, Literal, Protocol, cast - -from orcapod import contexts -from orcapod.core.datagrams import ( - ArrowPacket, - DictPacket, -) -from orcapod.utils.git_utils import get_git_info_for_python_object -from orcapod.core.kernels import KernelStream, TrackedKernelBase -from orcapod.core.operators import Join -from orcapod.core.streams import CachedPodStream, LazyPodResultStream -from orcapod.core.system_constants import constants -from orcapod.hashing.hash_utils import get_function_components, get_function_signature -from orcapod.protocols import core_protocols as cp -from orcapod.protocols import hashing_protocols as hp -from orcapod.protocols.database_protocols import ArrowDatabase -from orcapod.types import DataValue, PythonSchema, PythonSchemaLike -from orcapod.utils import types_utils -from orcapod.utils.lazy_module import LazyModule - - -# TODO: extract default char count as config -def combine_hashes( - *hashes: str, - order: bool = False, - prefix_hasher_id: bool = False, - hex_char_count: int | None = 20, -) -> str: - """Combine hashes into a single hash string.""" - - # Sort for deterministic order regardless of input order - if order: - prepared_hashes = sorted(hashes) - else: - prepared_hashes = list(hashes) - combined = "".join(prepared_hashes) - combined_hash = hashlib.sha256(combined.encode()).hexdigest() - if hex_char_count is not None: - combined_hash = combined_hash[:hex_char_count] - if prefix_hasher_id: - return "sha256@" + combined_hash - return combined_hash - - -if TYPE_CHECKING: - import pyarrow as pa - import pyarrow.compute as pc -else: - pa = LazyModule("pyarrow") - pc = LazyModule("pyarrow.compute") - -logger = logging.getLogger(__name__) - -error_handling_options = Literal["raise", "ignore", "warn"] - - -class ActivatablePodBase(TrackedKernelBase): - """ - FunctionPod is a specialized kernel that encapsulates a function to be executed on data streams. - It allows for the execution of a function with a specific label and can be tracked by the system. 
- """ - - @abstractmethod - def input_packet_types(self) -> PythonSchema: - """ - Return the input typespec for the pod. This is used to validate the input streams. - """ - ... - - @abstractmethod - def output_packet_types(self) -> PythonSchema: - """ - Return the output typespec for the pod. This is used to validate the output streams. - """ - ... - - @property - def version(self) -> str: - return self._version - - @abstractmethod - def get_record_id(self, packet: cp.Packet, execution_engine_hash: str) -> str: - """ - Return the record ID for the input packet. This is used to identify the pod in the system. - """ - ... - - @property - @abstractmethod - def tiered_pod_id(self) -> dict[str, str]: - """ - Return the tiered pod ID for the pod. This is used to identify the pod in a tiered architecture. - """ - ... - - def __init__( - self, - error_handling: error_handling_options = "raise", - label: str | None = None, - version: str = "v0.0", - **kwargs, - ) -> None: - super().__init__(label=label, **kwargs) - self._active = True - self.error_handling = error_handling - self._version = version - import re - - match = re.match(r"\D.*(\d+)", version) - major_version = 0 - if match: - major_version = int(match.group(1)) - else: - raise ValueError( - f"Version string {version} does not contain a valid version number" - ) - self.skip_type_checking = False - self._major_version = major_version - - @property - def major_version(self) -> int: - return self._major_version - - def kernel_output_types( - self, *streams: cp.Stream, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - """ - Return the input and output typespecs for the pod. - This is used to validate the input and output streams. - """ - tag_typespec, _ = streams[0].types(include_system_tags=include_system_tags) - return tag_typespec, self.output_packet_types() - - def is_active(self) -> bool: - """ - Check if the pod is active. If not, it will not process any packets. - """ - return self._active - - def set_active(self, active: bool) -> None: - """ - Set the active state of the pod. If set to False, the pod will not process any packets. - """ - self._active = active - - @staticmethod - def _join_streams(*streams: cp.Stream) -> cp.Stream: - if not streams: - raise ValueError("No streams provided for joining") - # Join the streams using a suitable join strategy - if len(streams) == 1: - return streams[0] - - joined_stream = streams[0] - for next_stream in streams[1:]: - joined_stream = Join()(joined_stream, next_stream) - return joined_stream - - def pre_kernel_processing(self, *streams: cp.Stream) -> tuple[cp.Stream, ...]: - """ - Prepare the incoming streams for execution in the pod. At least one stream must be present. - If more than one stream is present, the join of the provided streams will be returned. 
- """ - # if multiple streams are provided, join them - # otherwise, return as is - if len(streams) <= 1: - return streams - - output_stream = self._join_streams(*streams) - return (output_stream,) - - def validate_inputs(self, *streams: cp.Stream) -> None: - if len(streams) != 1: - raise ValueError( - f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" - ) - if self.skip_type_checking: - return - input_stream = streams[0] - _, incoming_packet_types = input_stream.types() - if not types_utils.check_typespec_compatibility( - incoming_packet_types, self.input_packet_types() - ): - # TODO: use custom exception type for better error handling - raise ValueError( - f"Incoming packet data type {incoming_packet_types} from {input_stream} is not compatible with expected input typespec {self.input_packet_types()}" - ) - - def prepare_output_stream( - self, *streams: cp.Stream, label: str | None = None - ) -> KernelStream: - return KernelStream(source=self, upstreams=streams, label=label) - - def forward(self, *streams: cp.Stream) -> cp.Stream: - assert len(streams) == 1, "PodBase.forward expects exactly one input stream" - return LazyPodResultStream(pod=self, prepared_stream=streams[0]) - - @abstractmethod - def call( - self, - tag: cp.Tag, - packet: cp.Packet, - record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, - ) -> tuple[cp.Tag, cp.Packet | None]: ... - - @abstractmethod - async def async_call( - self, - tag: cp.Tag, - packet: cp.Packet, - record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, - ) -> tuple[cp.Tag, cp.Packet | None]: ... - - def track_invocation(self, *streams: cp.Stream, label: str | None = None) -> None: - if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_pod_invocation(self, streams, label=label) - - -class CallableWithPod(Protocol): - def __call__(self, *args, **kwargs) -> Any: ... - - @property - def pod(self) -> "FunctionPod": ... - - -def function_pod( - output_keys: str | Collection[str] | None = None, - function_name: str | None = None, - version: str = "v0.0", - label: str | None = None, - **kwargs, -) -> Callable[..., CallableWithPod]: - """ - Decorator that attaches FunctionPod as pod attribute. - - Args: - output_keys: Keys for the function output(s) - function_name: Name of the function pod; if None, defaults to the function name - **kwargs: Additional keyword arguments to pass to the FunctionPod constructor. Please refer to the FunctionPod documentation for details. 
- - Returns: - CallableWithPod: Decorated function with `pod` attribute holding the FunctionPod instance - """ - - def decorator(func: Callable) -> CallableWithPod: - if func.__name__ == "": - raise ValueError("Lambda functions cannot be used with function_pod") - - # Store the original function in the module for pickling purposes - # and make sure to change the name of the function - - # Create a simple typed function pod - pod = FunctionPod( - function=func, - output_keys=output_keys, - function_name=function_name or func.__name__, - version=version, - label=label, - **kwargs, - ) - setattr(func, "pod", pod) - return cast(CallableWithPod, func) - - return decorator - - -class FunctionPod(ActivatablePodBase): - def __init__( - self, - function: cp.PodFunction, - output_keys: str | Collection[str] | None = None, - function_name=None, - version: str = "v0.0", - input_python_schema: PythonSchemaLike | None = None, - output_python_schema: PythonSchemaLike | Sequence[type] | None = None, - label: str | None = None, - function_info_extractor: hp.FunctionInfoExtractor | None = None, - **kwargs, - ) -> None: - self.function = function - - if output_keys is None: - output_keys = [] - if isinstance(output_keys, str): - output_keys = [output_keys] - self.output_keys = output_keys - if function_name is None: - if hasattr(self.function, "__name__"): - function_name = getattr(self.function, "__name__") - else: - raise ValueError( - "function_name must be provided if function has no __name__ attribute" - ) - self.function_name = function_name - # extract the first full index (potentially with leading 0) in the version string - if not isinstance(version, str): - raise TypeError(f"Version must be a string, got {type(version)}") - - super().__init__(label=label or self.function_name, version=version, **kwargs) - - # extract input and output types from the function signature - input_packet_types, output_packet_types = ( - types_utils.extract_function_typespecs( - self.function, - self.output_keys, - input_typespec=input_python_schema, - output_typespec=output_python_schema, - ) - ) - - # get git info for the function - env_info = get_git_info_for_python_object(self.function) - if env_info is None: - git_hash = "unknown" - else: - git_hash = env_info.get("git_commit_hash", "unknown") - if env_info.get("git_repo_status") == "dirty": - git_hash += "-dirty" - self._git_hash = git_hash - - self._input_packet_schema = dict(input_packet_types) - self._output_packet_schema = dict(output_packet_types) - # TODO: add output packet converter for speed up - - self._function_info_extractor = function_info_extractor - object_hasher = self.data_context.object_hasher - # TODO: fix and replace with object_hasher protocol specific methods - self._function_signature_hash = object_hasher.hash_object( - get_function_signature(self.function) - ).to_string() - self._function_content_hash = object_hasher.hash_object( - get_function_components(self.function) - ).to_string() - - self._output_packet_type_hash = object_hasher.hash_object( - self.output_packet_types() - ).to_string() - - self._total_pod_id_hash = object_hasher.hash_object( - self.tiered_pod_id - ).to_string() - - @property - def tiered_pod_id(self) -> dict[str, str]: - return { - "version": self.version, - "signature": self._function_signature_hash, - "content": self._function_content_hash, - "git_hash": self._git_hash, - } - - @property - def reference(self) -> tuple[str, ...]: - return ( - self.function_name, - self._output_packet_type_hash, - "v" + 
str(self.major_version), - ) - - def get_record_id( - self, - packet: cp.Packet, - execution_engine_hash: str, - ) -> str: - return combine_hashes( - str(packet.content_hash()), - self._total_pod_id_hash, - execution_engine_hash, - prefix_hasher_id=True, - ) - - def input_packet_types(self) -> PythonSchema: - """ - Return the input typespec for the function pod. - This is used to validate the input streams. - """ - return self._input_packet_schema.copy() - - def output_packet_types(self) -> PythonSchema: - """ - Return the output typespec for the function pod. - This is used to validate the output streams. - """ - return self._output_packet_schema.copy() - - def __repr__(self) -> str: - return f"FunctionPod:{self.function_name}" - - def __str__(self) -> str: - include_module = self.function.__module__ != "__main__" - func_sig = get_function_signature( - self.function, - name_override=self.function_name, - include_module=include_module, - ) - return f"FunctionPod:{func_sig}" - - def call( - self, - tag: cp.Tag, - packet: cp.Packet, - record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, - ) -> tuple[cp.Tag, DictPacket | None]: - if not self.is_active(): - logger.info( - f"Pod is not active: skipping computation on input packet {packet}" - ) - return tag, None - - execution_engine_hash = execution_engine.name if execution_engine else "default" - - # any kernel/pod invocation happening inside the function will NOT be tracked - if not isinstance(packet, dict): - input_dict = packet.as_dict(include_source=False) - else: - input_dict = packet - - with self._tracker_manager.no_tracking(): - if execution_engine is not None: - # use the provided execution engine to run the function - values = execution_engine.submit_sync(self.function, **input_dict) - else: - values = self.function(**input_dict) - - output_data = self.process_function_output(values) - - # TODO: extract out this function - def combine(*components: tuple[str, ...]) -> str: - inner_parsed = [":".join(component) for component in components] - return "::".join(inner_parsed) - - if record_id is None: - # if record_id is not provided, generate it from the packet - record_id = self.get_record_id(packet, execution_engine_hash) - source_info = { - k: combine(self.reference, (record_id,), (k,)) for k in output_data - } - - output_packet = DictPacket( - output_data, - source_info=source_info, - python_schema=self.output_packet_types(), - data_context=self.data_context, - ) - return tag, output_packet - - async def async_call( - self, - tag: cp.Tag, - packet: cp.Packet, - record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, - ) -> tuple[cp.Tag, cp.Packet | None]: - """ - Asynchronous call to the function pod. This is a placeholder for future implementation. - Currently, it behaves like the synchronous call. 
- """ - if not self.is_active(): - logger.info( - f"Pod is not active: skipping computation on input packet {packet}" - ) - return tag, None - - execution_engine_hash = execution_engine.name if execution_engine else "default" - - # any kernel/pod invocation happening inside the function will NOT be tracked - # with self._tracker_manager.no_tracking(): - # FIXME: figure out how to properly make context manager work with async/await - # any kernel/pod invocation happening inside the function will NOT be tracked - if not isinstance(packet, dict): - input_dict = packet.as_dict(include_source=False) - else: - input_dict = packet - if execution_engine is not None: - # use the provided execution engine to run the function - values = await execution_engine.submit_async(self.function, **input_dict) - else: - values = self.function(**input_dict) - - output_data = self.process_function_output(values) - - # TODO: extract out this function - def combine(*components: tuple[str, ...]) -> str: - inner_parsed = [":".join(component) for component in components] - return "::".join(inner_parsed) - - if record_id is None: - # if record_id is not provided, generate it from the packet - record_id = self.get_record_id(packet, execution_engine_hash) - source_info = { - k: combine(self.reference, (record_id,), (k,)) for k in output_data - } - - output_packet = DictPacket( - output_data, - source_info=source_info, - python_schema=self.output_packet_types(), - data_context=self.data_context, - ) - return tag, output_packet - - def process_function_output(self, values: Any) -> dict[str, DataValue]: - output_values = [] - if len(self.output_keys) == 0: - output_values = [] - elif len(self.output_keys) == 1: - output_values = [values] # type: ignore - elif isinstance(values, Iterable): - output_values = list(values) # type: ignore - elif len(self.output_keys) > 1: - raise ValueError( - "Values returned by function must be a pathlike or a sequence of pathlikes" - ) - - if len(output_values) != len(self.output_keys): - raise ValueError( - f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" - ) - - return {k: v for k, v in zip(self.output_keys, output_values)} - - def kernel_identity_structure( - self, streams: Collection[cp.Stream] | None = None - ) -> Any: - id_struct = (self.__class__.__name__,) + self.reference - # if streams are provided, perform pre-processing step, validate, and add the - # resulting single stream to the identity structure - if streams is not None and len(streams) != 0: - id_struct += tuple(streams) - - return id_struct - - -class WrappedPod(ActivatablePodBase): - """ - A wrapper for an existing pod, allowing for additional functionality or modifications without changing the original pod. - This class is meant to serve as a base class for other pods that need to wrap existing pods. - Note that only the call logic is pass through to the wrapped pod, but the forward logic is not. - """ - - def __init__( - self, - pod: cp.Pod, - label: str | None = None, - data_context: str | contexts.DataContext | None = None, - **kwargs, - ) -> None: - # if data_context is not explicitly given, use that of the contained pod - if data_context is None: - data_context = pod.data_context_key - super().__init__( - label=label, - data_context=data_context, - **kwargs, - ) - self.pod = pod - - @property - def reference(self) -> tuple[str, ...]: - """ - Return the pod ID, which is the function name of the wrapped pod. 
- This is used to identify the pod in the system. - """ - return self.pod.reference - - def get_record_id(self, packet: cp.Packet, execution_engine_hash: str) -> str: - return self.pod.get_record_id(packet, execution_engine_hash) - - @property - def tiered_pod_id(self) -> dict[str, str]: - """ - Return the tiered pod ID for the wrapped pod. This is used to identify the pod in a tiered architecture. - """ - return self.pod.tiered_pod_id - - def computed_label(self) -> str | None: - return self.pod.label - - def input_packet_types(self) -> PythonSchema: - """ - Return the input typespec for the stored pod. - This is used to validate the input streams. - """ - return self.pod.input_packet_types() - - def output_packet_types(self) -> PythonSchema: - """ - Return the output typespec for the stored pod. - This is used to validate the output streams. - """ - return self.pod.output_packet_types() - - def validate_inputs(self, *streams: cp.Stream) -> None: - self.pod.validate_inputs(*streams) - - def call( - self, - tag: cp.Tag, - packet: cp.Packet, - record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, - ) -> tuple[cp.Tag, cp.Packet | None]: - return self.pod.call( - tag, packet, record_id=record_id, execution_engine=execution_engine - ) - - async def async_call( - self, - tag: cp.Tag, - packet: cp.Packet, - record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, - ) -> tuple[cp.Tag, cp.Packet | None]: - return await self.pod.async_call( - tag, packet, record_id=record_id, execution_engine=execution_engine - ) - - def kernel_identity_structure( - self, streams: Collection[cp.Stream] | None = None - ) -> Any: - return self.pod.identity_structure(streams) - - def __repr__(self) -> str: - return f"WrappedPod({self.pod!r})" - - def __str__(self) -> str: - return f"WrappedPod:{self.pod!s}" - - -class CachedPod(WrappedPod): - """ - A pod that caches the results of the wrapped pod. - This is useful for pods that are expensive to compute and can benefit from caching. - """ - - # name of the column in the tag store that contains the packet hash - DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" - - def __init__( - self, - pod: cp.Pod, - result_database: ArrowDatabase, - record_path_prefix: tuple[str, ...] = (), - match_tier: str | None = None, - retrieval_mode: Literal["latest", "most_specific"] = "latest", - **kwargs, - ): - super().__init__(pod, **kwargs) - self.record_path_prefix = record_path_prefix - self.result_database = result_database - self.match_tier = match_tier - self.retrieval_mode = retrieval_mode - self.mode: Literal["production", "development"] = "production" - - def set_mode(self, mode: str) -> None: - if mode not in ("production", "development"): - raise ValueError(f"Invalid mode: {mode}") - self.mode = mode - - @property - def version(self) -> str: - return self.pod.version - - @property - def record_path(self) -> tuple[str, ...]: - """ - Return the path to the record in the result store. - This is used to store the results of the pod. 
- """ - return self.record_path_prefix + self.reference - - def call( - self, - tag: cp.Tag, - packet: cp.Packet, - record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, - skip_cache_lookup: bool = False, - skip_cache_insert: bool = False, - ) -> tuple[cp.Tag, cp.Packet | None]: - # TODO: consider logic for overwriting existing records - execution_engine_hash = execution_engine.name if execution_engine else "default" - if record_id is None: - record_id = self.get_record_id( - packet, execution_engine_hash=execution_engine_hash - ) - output_packet = None - if not skip_cache_lookup and self.mode == "production": - print("Checking for cache...") - output_packet = self.get_cached_output_for_packet(packet) - if output_packet is not None: - print(f"Cache hit for {packet}!") - if output_packet is None: - tag, output_packet = super().call( - tag, packet, record_id=record_id, execution_engine=execution_engine - ) - if ( - output_packet is not None - and not skip_cache_insert - and self.mode == "production" - ): - self.record_packet(packet, output_packet, record_id=record_id) - - return tag, output_packet - - async def async_call( - self, - tag: cp.Tag, - packet: cp.Packet, - record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, - skip_cache_lookup: bool = False, - skip_cache_insert: bool = False, - ) -> tuple[cp.Tag, cp.Packet | None]: - # TODO: consider logic for overwriting existing records - execution_engine_hash = execution_engine.name if execution_engine else "default" - - if record_id is None: - record_id = self.get_record_id( - packet, execution_engine_hash=execution_engine_hash - ) - output_packet = None - if not skip_cache_lookup: - output_packet = self.get_cached_output_for_packet(packet) - if output_packet is None: - tag, output_packet = await super().async_call( - tag, packet, record_id=record_id, execution_engine=execution_engine - ) - if output_packet is not None and not skip_cache_insert: - self.record_packet( - packet, - output_packet, - record_id=record_id, - execution_engine=execution_engine, - ) - - return tag, output_packet - - def forward(self, *streams: cp.Stream) -> cp.Stream: - assert len(streams) == 1, "PodBase.forward expects exactly one input stream" - return CachedPodStream(pod=self, input_stream=streams[0]) - - def record_packet( - self, - input_packet: cp.Packet, - output_packet: cp.Packet, - record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, - skip_duplicates: bool = False, - ) -> cp.Packet: - """ - Record the output packet against the input packet in the result store. 
- """ - data_table = output_packet.as_table(include_context=True, include_source=True) - - for i, (k, v) in enumerate(self.tiered_pod_id.items()): - # add the tiered pod ID to the data table - data_table = data_table.add_column( - i, - f"{constants.POD_ID_PREFIX}{k}", - pa.array([v], type=pa.large_string()), - ) - - # add the input packet hash as a column - data_table = data_table.add_column( - 0, - constants.INPUT_PACKET_HASH, - pa.array([str(input_packet.content_hash())], type=pa.large_string()), - ) - # add execution engine information - execution_engine_hash = execution_engine.name if execution_engine else "default" - data_table = data_table.append_column( - constants.EXECUTION_ENGINE, - pa.array([execution_engine_hash], type=pa.large_string()), - ) - - # add computation timestamp - timestamp = datetime.now(timezone.utc) - data_table = data_table.append_column( - constants.POD_TIMESTAMP, - pa.array([timestamp], type=pa.timestamp("us", tz="UTC")), - ) - - if record_id is None: - record_id = self.get_record_id( - input_packet, execution_engine_hash=execution_engine_hash - ) - - self.result_database.add_record( - self.record_path, - record_id, - data_table, - skip_duplicates=skip_duplicates, - ) - # if result_flag is None: - # # TODO: do more specific error handling - # raise ValueError( - # f"Failed to record packet {input_packet} in result store {self.result_store}" - # ) - # # TODO: make store return retrieved table - return output_packet - - def get_cached_output_for_packet(self, input_packet: cp.Packet) -> cp.Packet | None: - """ - Retrieve the output packet from the result store based on the input packet. - If more than one output packet is found, conflict resolution strategy - will be applied. - If the output packet is not found, return None. - """ - # result_table = self.result_store.get_record_by_id( - # self.record_path, - # self.get_entry_hash(input_packet), - # ) - - # get all records with matching the input packet hash - # TODO: add match based on match_tier if specified - constraints = {constants.INPUT_PACKET_HASH: str(input_packet.content_hash())} - if self.match_tier is not None: - constraints[f"{constants.POD_ID_PREFIX}{self.match_tier}"] = ( - self.pod.tiered_pod_id[self.match_tier] - ) - - result_table = self.result_database.get_records_with_column_value( - self.record_path, - constraints, - ) - if result_table is None or result_table.num_rows == 0: - return None - - if result_table.num_rows > 1: - logger.info( - f"Performing conflict resolution for multiple records for {input_packet.content_hash().display_name()}" - ) - if self.retrieval_mode == "latest": - result_table = result_table.sort_by( - self.DATA_RETRIEVED_FLAG, ascending=False - ).take([0]) - elif self.retrieval_mode == "most_specific": - # match by the most specific pod ID - # trying next level if not found - for k, v in reversed(self.tiered_pod_id.items()): - search_result = result_table.filter( - pc.field(f"{constants.POD_ID_PREFIX}{k}") == v - ) - if search_result.num_rows > 0: - result_table = search_result.take([0]) - break - if result_table.num_rows > 1: - logger.warning( - f"No matching record found for {input_packet.content_hash().display_name()} with tiered pod ID {self.tiered_pod_id}" - ) - result_table = result_table.sort_by( - self.DATA_RETRIEVED_FLAG, ascending=False - ).take([0]) - - else: - raise ValueError( - f"Unknown retrieval mode: {self.retrieval_mode}. Supported modes are 'latest' and 'most_specific'." 
- ) - - pod_id_columns = [ - f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() - ] - result_table = result_table.drop_columns(pod_id_columns) - result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) - - # note that data context will be loaded from the result store - return ArrowPacket( - result_table, - meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, - ) - - def get_all_cached_outputs( - self, include_system_columns: bool = False - ) -> "pa.Table | None": - """ - Get all records from the result store for this pod. - If include_system_columns is True, include system columns in the result. - """ - record_id_column = ( - constants.PACKET_RECORD_ID if include_system_columns else None - ) - result_table = self.result_database.get_all_records( - self.record_path, record_id_column=record_id_column - ) - if result_table is None or result_table.num_rows == 0: - return None - - if not include_system_columns: - # remove input packet hash and tiered pod ID columns - pod_id_columns = [ - f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() - ] - result_table = result_table.drop_columns(pod_id_columns) - result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) - - return result_table diff --git a/src/orcapod/core/polars_data_utils.py b/src/orcapod/core/polars_data_utils.py index 7757a1d..07284c4 100644 --- a/src/orcapod/core/polars_data_utils.py +++ b/src/orcapod/core/polars_data_utils.py @@ -1,8 +1,9 @@ # Collection of functions to work with Arrow table data that underlies streams and/or datagrams -from orcapod.utils.lazy_module import LazyModule +from collections.abc import Collection from typing import TYPE_CHECKING + from orcapod.core.system_constants import constants -from collections.abc import Collection +from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: import polars as pl diff --git a/src/orcapod/core/schema.py b/src/orcapod/core/schema.py new file mode 100644 index 0000000..e69de29 diff --git a/src/orcapod/core/sources/arrow_table_source.py b/src/orcapod/core/sources/arrow_table_source.py index 7d3c789..884f2cb 100644 --- a/src/orcapod/core/sources/arrow_table_source.py +++ b/src/orcapod/core/sources/arrow_table_source.py @@ -6,7 +6,7 @@ from orcapod.protocols import core_protocols as cp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants from orcapod.core import arrow_data_utils from orcapod.core.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry diff --git a/src/orcapod/core/sources/base.py b/src/orcapod/core/sources/base.py index 89c8ff9..2b8b8fb 100644 --- a/src/orcapod/core/sources/base.py +++ b/src/orcapod/core/sources/base.py @@ -4,12 +4,13 @@ from typing import TYPE_CHECKING, Any -from orcapod.core.kernels import TrackedKernelBase +from orcapod.core.executable_pod import TrackedKernelBase from orcapod.core.streams import ( KernelStream, StatefulStreamBase, ) from orcapod.protocols import core_protocols as cp +import orcapod.protocols.core_protocols.execution_engine from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule @@ -118,7 +119,8 @@ def __iter__(self) -> Iterator[tuple[cp.Tag, cp.Packet]]: def iter_packets( self, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, ) -> 
Iterator[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().iter_packets(execution_engine=execution_engine) @@ -130,7 +132,8 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, ) -> "pa.Table": """Delegate to the cached KernelStream.""" return self().as_table( @@ -143,7 +146,9 @@ def as_table( ) def flow( - self, execution_engine: cp.ExecutionEngine | None = None + self, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, ) -> Collection[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().flow(execution_engine=execution_engine) @@ -151,7 +156,8 @@ def flow( def run( self, *args: Any, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, **kwargs: Any, ) -> None: """ @@ -164,7 +170,8 @@ def run( async def run_async( self, *args: Any, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, **kwargs: Any, ) -> None: """ @@ -338,7 +345,8 @@ def __iter__(self) -> Iterator[tuple[cp.Tag, cp.Packet]]: def iter_packets( self, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, ) -> Iterator[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().iter_packets(execution_engine=execution_engine) @@ -350,7 +358,8 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, ) -> "pa.Table": """Delegate to the cached KernelStream.""" return self().as_table( @@ -363,7 +372,9 @@ def as_table( ) def flow( - self, execution_engine: cp.ExecutionEngine | None = None + self, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, ) -> Collection[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().flow(execution_engine=execution_engine) @@ -371,7 +382,8 @@ def flow( def run( self, *args: Any, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, **kwargs: Any, ) -> None: """ @@ -384,7 +396,8 @@ def run( async def run_async( self, *args: Any, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, **kwargs: Any, ) -> None: """ diff --git a/src/orcapod/core/sources/data_frame_source.py b/src/orcapod/core/sources/data_frame_source.py index 2fb4a78..c029926 100644 --- a/src/orcapod/core/sources/data_frame_source.py +++ b/src/orcapod/core/sources/data_frame_source.py @@ -5,7 +5,7 @@ from orcapod.protocols import core_protocols as cp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants from orcapod.core import 
polars_data_utils from orcapod.core.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry import logging diff --git a/src/orcapod/core/sources/dict_source.py b/src/orcapod/core/sources/dict_source.py index d291b3f..9c08b37 100644 --- a/src/orcapod/core/sources/dict_source.py +++ b/src/orcapod/core/sources/dict_source.py @@ -5,7 +5,7 @@ from orcapod.protocols import core_protocols as cp from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils.lazy_module import LazyModule -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants from orcapod.core.sources.arrow_table_source import ArrowTableSource if TYPE_CHECKING: diff --git a/src/orcapod/core/sources/list_source.py b/src/orcapod/core/sources/list_source.py index fdc7ffa..3d2d394 100644 --- a/src/orcapod/core/sources/list_source.py +++ b/src/orcapod/core/sources/list_source.py @@ -6,7 +6,7 @@ from pyarrow.lib import Table from orcapod.core.datagrams import DictTag -from orcapod.core.kernels import TrackedKernelBase +from orcapod.core.executable_pod import TrackedKernelBase from orcapod.core.streams import ( TableStream, KernelStream, @@ -17,7 +17,7 @@ from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants from orcapod.semantic_types import infer_python_schema_from_pylist_data if TYPE_CHECKING: diff --git a/src/orcapod/core/streams/__init__.py b/src/orcapod/core/streams/__init__.py index 9f1d625..2004bbe 100644 --- a/src/orcapod/core/streams/__init__.py +++ b/src/orcapod/core/streams/__init__.py @@ -1,10 +1,11 @@ -from .base import StatefulStreamBase -from .kernel_stream import KernelStream +# from .base import StatefulStreamBase +# from .pod_stream import KernelStream from .table_stream import TableStream -from .lazy_pod_stream import LazyPodResultStream -from .cached_pod_stream import CachedPodStream -from .wrapped_stream import WrappedStream -from .pod_node_stream import PodNodeStream + +# from .packet_processor_stream import LazyPodResultStream +# from .cached_packet_processor_stream import CachedPodStream +# from .wrapped_stream import WrappedStream +# from .pod_node_stream import PodNodeStream __all__ = [ diff --git a/src/orcapod/core/streams/base.py b/src/orcapod/core/streams/base.py index 8cb1bbb..5d91b28 100644 --- a/src/orcapod/core/streams/base.py +++ b/src/orcapod/core/streams/base.py @@ -1,24 +1,20 @@ import logging from abc import abstractmethod from collections.abc import Collection, Iterator, Mapping -from datetime import datetime, timezone from typing import TYPE_CHECKING, Any -from orcapod import contexts -from orcapod.core.base import LabeledContentIdentifiableBase -from orcapod.protocols import core_protocols as cp +from orcapod.core.base import OrcapodBase +from orcapod.protocols.core_protocols import Pod, Stream, Tag, Packet, ColumnConfig from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: import pyarrow as pa - import pyarrow.compute as pc import polars as pl import pandas as pd else: pa = LazyModule("pyarrow") - pc = LazyModule("pyarrow.compute") pl = LazyModule("polars") pd = LazyModule("pandas") @@ -29,8 +25,31 @@ logger = logging.getLogger(__name__) -class OperatorStreamBaseMixin: - def join(self, other_stream: cp.Stream, label: str | None = None) -> cp.Stream: +class 
StreamBase(OrcapodBase): + @property + @abstractmethod + def source(self) -> Pod | None: ... + + @property + @abstractmethod + def upstreams(self) -> tuple[Stream, ...]: ... + + def computed_label(self) -> str | None: + if self.source is not None: + # use the invocation operation label + return self.source.label + return None + + def identity_structure(self) -> Any: + # Identity of a PodStream is determined by the pod and its upstreams + if self.source is None: + raise ValueError("Stream has no source pod for identity structure.") + return ( + self.source, + self.source.argument_symmetry(self.upstreams), + ) + + def join(self, other_stream: Stream, label: str | None = None) -> Stream: """ Joins this stream with another stream, returning a new stream that contains the combined data from both streams. @@ -41,9 +60,9 @@ def join(self, other_stream: cp.Stream, label: str | None = None) -> cp.Stream: def semi_join( self, - other_stream: cp.Stream, + other_stream: Stream, label: str | None = None, - ) -> cp.Stream: + ) -> Stream: """ Performs a semi-join with another stream, returning a new stream that contains only the packets from this stream that have matching tags in the other stream. @@ -57,7 +76,7 @@ def map_tags( name_map: Mapping[str, str], drop_unmapped: bool = True, label: str | None = None, - ) -> cp.Stream: + ) -> Stream: """ Maps the tags in this stream according to the provided name_map. If drop_unmapped is True, any tags that are not in the name_map will be dropped. @@ -71,7 +90,7 @@ def map_packets( name_map: Mapping[str, str], drop_unmapped: bool = True, label: str | None = None, - ) -> cp.Stream: + ) -> Stream: """ Maps the packets in this stream according to the provided packet_map. If drop_unmapped is True, any packets that are not in the packet_map will be dropped. @@ -81,11 +100,11 @@ def map_packets( return MapPackets(name_map, drop_unmapped)(self, label=label) # type: ignore def batch( - self: cp.Stream, + self, batch_size: int = 0, drop_partial_batch: bool = False, label: str | None = None, - ) -> cp.Stream: + ) -> Stream: """ Batch stream into fixed-size chunks, each of size batch_size. If drop_last is True, any remaining elements that don't fit into a full batch will be dropped. @@ -97,12 +116,12 @@ def batch( ) # type: ignore def polars_filter( - self: cp.Stream, + self, *predicates: Any, constraint_map: Mapping[str, Any] | None = None, label: str | None = None, **constraints: Any, - ) -> cp.Stream: + ) -> Stream: from orcapod.core.operators import PolarsFilter total_constraints = dict(constraint_map) if constraint_map is not None else {} @@ -114,11 +133,11 @@ def polars_filter( ) def select_tag_columns( - self: cp.Stream, + self, tag_columns: str | Collection[str], strict: bool = True, label: str | None = None, - ) -> cp.Stream: + ) -> Stream: """ Select the specified tag columns from the stream. A ValueError is raised if one or more specified tag columns do not exist in the stream unless strict = False. @@ -128,11 +147,11 @@ def select_tag_columns( return SelectTagColumns(tag_columns, strict=strict)(self, label=label) def select_packet_columns( - self: cp.Stream, + self, packet_columns: str | Collection[str], strict: bool = True, label: str | None = None, - ) -> cp.Stream: + ) -> Stream: """ Select the specified packet columns from the stream. A ValueError is raised if one or more specified packet columns do not exist in the stream unless strict = False. 
@@ -142,297 +161,114 @@ def select_packet_columns( return SelectPacketColumns(packet_columns, strict=strict)(self, label=label) def drop_tag_columns( - self: cp.Stream, + self, tag_columns: str | Collection[str], strict: bool = True, label: str | None = None, - ) -> cp.Stream: + ) -> Stream: from orcapod.core.operators import DropTagColumns return DropTagColumns(tag_columns, strict=strict)(self, label=label) def drop_packet_columns( - self: cp.Stream, + self, packet_columns: str | Collection[str], strict: bool = True, label: str | None = None, - ) -> cp.Stream: + ) -> Stream: from orcapod.core.operators import DropPacketColumns return DropPacketColumns(packet_columns, strict=strict)(self, label=label) - -class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase): - """ - A stream that has a unique identity within the pipeline. - """ - - def pop(self) -> cp.Stream: - return self - - def __init__( - self, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self._last_modified: datetime | None = None - self._set_modified_time() - # note that this is not necessary for Stream protocol, but is provided - # for convenience to resolve semantic types and other context-specific information - self._execution_engine = execution_engine - - @property - def substream_identities(self) -> tuple[str, ...]: - """ - Returns the identities of the substreams that this stream is composed of. - This is used to identify the substreams in the computational graph. - """ - return (self.content_hash().to_hex(),) - - @property - def execution_engine(self) -> cp.ExecutionEngine | None: - """ - Returns the execution engine that is used to execute this stream. - This is typically used to track the execution context of the stream. - """ - return self._execution_engine - - @execution_engine.setter - def execution_engine(self, engine: cp.ExecutionEngine | None) -> None: - """ - Sets the execution engine for the stream. - This is typically used to track the execution context of the stream. - """ - self._execution_engine = engine - - def get_substream(self, substream_id: str) -> cp.Stream: - """ - Returns the substream with the given substream_id. - This is used to retrieve a specific substream from the stream. - """ - if substream_id == self.substream_identities[0]: - return self - else: - raise ValueError(f"Substream with ID {substream_id} not found.") - - @property - @abstractmethod - def source(self) -> cp.Kernel | None: - """ - The source of the stream, which is the kernel that generated the stream. - This is typically used to track the origin of the stream in the computational graph. - """ - ... - - @property - @abstractmethod - def upstreams(self) -> tuple[cp.Stream, ...]: - """ - The upstream streams that are used to generate this stream. - This is typically used to track the origin of the stream in the computational graph. - """ - ... - - def computed_label(self) -> str | None: - if self.source is not None: - # use the invocation operation label - return self.source.label - return None - @abstractmethod def keys( - self, include_system_tags: bool = False + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[tuple[str, ...], tuple[str, ...]]: ... 
- def tag_keys(self, include_system_tags: bool = False) -> tuple[str, ...]: - return self.keys(include_system_tags=include_system_tags)[0] - - def packet_keys(self) -> tuple[str, ...]: - return self.keys()[1] - @abstractmethod - def types( - self, include_system_tags: bool = False + def output_schema( + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: ... - def tag_types(self, include_system_tags: bool = False) -> PythonSchema: - return self.types(include_system_tags=include_system_tags)[0] - - def packet_types(self) -> PythonSchema: - return self.types()[1] - - @property - def last_modified(self) -> datetime | None: - """ - Returns when the stream's content was last modified. - This is used to track the time when the stream was last accessed. - Returns None if the stream has not been accessed yet. - """ - return self._last_modified - - @property - def is_current(self) -> bool: - """ - Returns whether the stream is current. - A stream is current if the content is up-to-date with respect to its source. - This can be used to determine if a stream with non-None last_modified is up-to-date. - Note that for asynchronous streams, this status is not applicable and always returns False. - """ - if self.last_modified is None: - # If there is no last_modified timestamp, we cannot determine if the stream is current - return False - - # check if the source kernel has been modified - if self.source is not None and ( - self.source.last_modified is None - or self.source.last_modified > self.last_modified - ): - return False - - # check if all upstreams are current - for upstream in self.upstreams: - if ( - not upstream.is_current - or upstream.last_modified is None - or upstream.last_modified > self.last_modified - ): - return False - return True - - def _set_modified_time( - self, timestamp: datetime | None = None, invalidate: bool = False - ) -> None: - if invalidate: - self._last_modified = None - return - - if timestamp is not None: - self._last_modified = timestamp - else: - self._last_modified = datetime.now(timezone.utc) - def __iter__( self, - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + ) -> Iterator[tuple[Tag, Packet]]: return self.iter_packets() @abstractmethod def iter_packets( self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: ... - - @abstractmethod - def run( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any, - ) -> None: ... - - @abstractmethod - async def run_async( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any, - ) -> None: ... + ) -> Iterator[tuple[Tag, Packet]]: ... @abstractmethod def as_table( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": ... def as_polars_df( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pl.DataFrame": """ Convert the entire stream to a Polars DataFrame. 
""" return pl.DataFrame( self.as_table( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, + columns=columns, + all_info=all_info, ) ) def as_df( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pl.DataFrame": """ Convert the entire stream to a Polars DataFrame. """ return self.as_polars_df( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, + columns=columns, + all_info=all_info, ) def as_lazy_frame( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pl.LazyFrame": """ Convert the entire stream to a Polars LazyFrame. """ df = self.as_polars_df( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, + columns=columns, + all_info=all_info, ) return df.lazy() def as_pandas_df( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - index_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + index_by_tags: bool = False, + all_info: bool = False, ) -> "pd.DataFrame": df = self.as_polars_df( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, + columns=columns, + all_info=all_info, ) tag_keys, _ = self.keys() pdf = df.to_pandas() @@ -441,46 +277,39 @@ def as_pandas_df( return pdf def flow( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> Collection[tuple[cp.Tag, cp.Packet]]: + self, + ) -> Collection[tuple[Tag, Packet]]: """ Flow everything through the stream, returning the entire collection of (Tag, Packet) as a collection. This will tigger any upstream computation of the stream. 
""" - return [e for e in self.iter_packets(execution_engine=execution_engine)] + return [e for e in self.iter_packets()] def _repr_html_(self) -> str: df = self.as_polars_df() - tag_map = {t: f"*{t}" for t in self.tag_keys()} + tag_map = {t: f"*{t}" for t in self.keys()[0]} # TODO: construct repr html better df = df.rename(tag_map) return f"{self.__class__.__name__}[{self.label}]\n" + df._repr_html_() def view( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "StreamView": df = self.as_polars_df( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, + columns=columns, + all_info=all_info, ) - tag_map = {t: f"*{t}" for t in self.tag_keys()} + tag_map = {t: f"*{t}" for t in self.keys()[0]} # TODO: construct repr html better df = df.rename(tag_map) return StreamView(self, df) class StreamView: - def __init__(self, stream: StatefulStreamBase, view_df: "pl.DataFrame") -> None: + def __init__(self, stream: StreamBase, view_df: "pl.DataFrame") -> None: self._stream = stream self._view_df = view_df @@ -489,130 +318,3 @@ def _repr_html_(self) -> str: f"{self._stream.__class__.__name__}[{self._stream.label}]\n" + self._view_df._repr_html_() ) - - # def identity_structure(self) -> Any: - # """ - # Identity structure of a stream is deferred to the identity structure - # of the associated invocation, if present. - # A bare stream without invocation has no well-defined identity structure. - # Specialized stream subclasses should override this method to provide more meaningful identity structure - # """ - # ... - - -class StreamBase(StatefulStreamBase): - """ - A stream is a collection of tagged-packets that are generated by an operation. - The stream is iterable and can be used to access the packets in the stream. - - A stream has property `invocation` that is an instance of Invocation that generated the stream. - This may be None if the stream is not generated by a kernel (i.e. directly instantiated by a user). - """ - - def __init__( - self, - source: cp.Kernel | None = None, - upstreams: tuple[cp.Stream, ...] = (), - data_context: str | contexts.DataContext | None = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self._source = source - self._upstreams = upstreams - - # if data context is not provided, use that of the source kernel - if data_context is None and source is not None: - # if source is provided, use its data context - data_context = source.data_context_key - super().__init__(data_context=data_context, **kwargs) - - @property - def source(self) -> cp.Kernel | None: - """ - The source of the stream, which is the kernel that generated the stream. - This is typically used to track the origin of the stream in the computational graph. - """ - return self._source - - @property - def upstreams(self) -> tuple[cp.Stream, ...]: - """ - The upstream streams that are used to generate this stream. - This is typically used to track the origin of the stream in the computational graph. 
- """ - return self._upstreams - - def computed_label(self) -> str | None: - if self.source is not None: - # use the invocation operation label - return self.source.label - return None - - # @abstractmethod - # def iter_packets( - # self, - # execution_engine: dp.ExecutionEngine | None = None, - # ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... - - # @abstractmethod - # def run( - # self, - # execution_engine: dp.ExecutionEngine | None = None, - # ) -> None: ... - - # @abstractmethod - # async def run_async( - # self, - # execution_engine: dp.ExecutionEngine | None = None, - # ) -> None: ... - - # @abstractmethod - # def as_table( - # self, - # include_data_context: bool = False, - # include_source: bool = False, - # include_system_tags: bool = False, - # include_content_hash: bool | str = False, - # sort_by_tags: bool = True, - # execution_engine: dp.ExecutionEngine | None = None, - # ) -> "pa.Table": ... - - def identity_structure(self) -> Any: - """ - Identity structure of a stream is deferred to the identity structure - of the associated invocation, if present. - A bare stream without invocation has no well-defined identity structure. - Specialized stream subclasses should override this method to provide more meaningful identity structure - """ - if self.source is not None: - # if the stream is generated by an operation, use the identity structure from the invocation - return self.source.identity_structure(self.upstreams) - return super().identity_structure() - - -class ImmutableStream(StreamBase): - """ - A class of stream that is constructed from immutable/constant data and does not change over time. - Consequently, the identity of an unsourced stream should be based on the content of the stream itself. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._data_content_identity = None - - @abstractmethod - def data_content_identity_structure(self) -> Any: - """ - Returns a hash of the content of the stream. - This is used to identify the content of the stream. - """ - ... 
- - def identity_structure(self) -> Any: - if self.source is not None: - # if the stream is generated by an operation, use the identity structure from the invocation - return self.source.identity_structure(self.upstreams) - # otherwise, use the content of the stream as the identity structure - if self._data_content_identity is None: - self._data_content_identity = self.data_content_identity_structure() - return self._data_content_identity diff --git a/src/orcapod/core/streams/cached_pod_stream.py b/src/orcapod/core/streams/cached_pod_stream.py deleted file mode 100644 index 6e667e9..0000000 --- a/src/orcapod/core/streams/cached_pod_stream.py +++ /dev/null @@ -1,461 +0,0 @@ -import logging -from collections.abc import Iterator -from typing import TYPE_CHECKING, Any - -from orcapod.core.system_constants import constants -from orcapod.protocols import core_protocols as cp -from orcapod.types import PythonSchema -from orcapod.utils import arrow_utils -from orcapod.utils.lazy_module import LazyModule -from orcapod.core.streams.base import StreamBase -from orcapod.core.streams.table_stream import TableStream - - -if TYPE_CHECKING: - import pyarrow as pa - import pyarrow.compute as pc - import polars as pl - -else: - pa = LazyModule("pyarrow") - pc = LazyModule("pyarrow.compute") - pl = LazyModule("polars") - - -# TODO: consider using this instead of making copy of dicts -# from types import MappingProxyType - -logger = logging.getLogger(__name__) - - -class CachedPodStream(StreamBase): - """ - A fixed stream that lazily processes packets from a prepared input stream. - This is what Pod.process() returns - it's static/fixed but efficient. - """ - - # TODO: define interface for storage or pod storage - def __init__(self, pod: cp.CachedPod, input_stream: cp.Stream, **kwargs): - super().__init__(source=pod, upstreams=(input_stream,), **kwargs) - self.pod = pod - self.input_stream = input_stream - self._set_modified_time() # set modified time to when we obtain the iterator - # capture the immutable iterator from the input stream - - self._prepared_stream_iterator = input_stream.iter_packets() - - # Packet-level caching (from your PodStream) - self._cached_output_packets: list[tuple[cp.Tag, cp.Packet | None]] | None = None - self._cached_output_table: pa.Table | None = None - self._cached_content_hash_column: pa.Array | None = None - - def set_mode(self, mode: str) -> None: - return self.pod.set_mode(mode) - - @property - def mode(self) -> str: - return self.pod.mode - - def test(self) -> cp.Stream: - return self - - async def run_async( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any, - ) -> None: - """ - Runs the stream, processing the input stream and preparing the output stream. - This is typically called before iterating over the packets. 
- """ - if self._cached_output_packets is None: - cached_results = [] - - # identify all entries in the input stream for which we still have not computed packets - target_entries = self.input_stream.as_table( - include_content_hash=constants.INPUT_PACKET_HASH, - include_source=True, - include_system_tags=True, - ) - existing_entries = self.pod.get_all_cached_outputs( - include_system_columns=True - ) - if existing_entries is None or existing_entries.num_rows == 0: - missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) - existing = None - else: - all_results = target_entries.join( - existing_entries.append_column( - "_exists", pa.array([True] * len(existing_entries)) - ), - keys=[constants.INPUT_PACKET_HASH], - join_type="left outer", - right_suffix="_right", - ) - # grab all columns from target_entries first - missing = ( - all_results.filter(pc.is_null(pc.field("_exists"))) - .select(target_entries.column_names) - .drop_columns([constants.INPUT_PACKET_HASH]) - ) - - existing = ( - all_results.filter(pc.is_valid(pc.field("_exists"))) - .drop_columns(target_entries.column_names) - .drop_columns(["_exists"]) - ) - renamed = [ - c.removesuffix("_right") if c.endswith("_right") else c - for c in existing.column_names - ] - existing = existing.rename_columns(renamed) - - tag_keys = self.input_stream.keys()[0] - - if existing is not None and existing.num_rows > 0: - # If there are existing entries, we can cache them - existing_stream = TableStream(existing, tag_columns=tag_keys) - for tag, packet in existing_stream.iter_packets(): - cached_results.append((tag, packet)) - - pending_calls = [] - if missing is not None and missing.num_rows > 0: - for tag, packet in TableStream(missing, tag_columns=tag_keys): - # Since these packets are known to be missing, skip the cache lookup - pending = self.pod.async_call( - tag, - packet, - skip_cache_lookup=True, - execution_engine=execution_engine, - ) - pending_calls.append(pending) - import asyncio - - completed_calls = await asyncio.gather(*pending_calls) - for result in completed_calls: - cached_results.append(result) - - self._cached_output_packets = cached_results - self._set_modified_time() - - def run( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any, - ) -> None: - cached_results = [] - - # identify all entries in the input stream for which we still have not computed packets - target_entries = self.input_stream.as_table( - include_system_tags=True, - include_source=True, - include_content_hash=constants.INPUT_PACKET_HASH, - execution_engine=execution_engine, - ) - existing_entries = self.pod.get_all_cached_outputs(include_system_columns=True) - if ( - existing_entries is None - or existing_entries.num_rows == 0 - or self.mode == "development" - ): - missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) - existing = None - else: - # TODO: do more proper replacement operation - target_df = pl.DataFrame(target_entries) - existing_df = pl.DataFrame( - existing_entries.append_column( - "_exists", pa.array([True] * len(existing_entries)) - ) - ) - all_results_df = target_df.join( - existing_df, - on=constants.INPUT_PACKET_HASH, - how="left", - suffix="_right", - ) - all_results = all_results_df.to_arrow() - - missing = ( - all_results.filter(pc.is_null(pc.field("_exists"))) - .select(target_entries.column_names) - .drop_columns([constants.INPUT_PACKET_HASH]) - ) - - existing = all_results.filter( - pc.is_valid(pc.field("_exists")) - ).drop_columns( - [ - "_exists", - 
constants.INPUT_PACKET_HASH, - constants.PACKET_RECORD_ID, - *self.input_stream.keys()[1], # remove the input packet keys - ] - # TODO: look into NOT fetching back the record ID - ) - renamed = [ - c.removesuffix("_right") if c.endswith("_right") else c - for c in existing.column_names - ] - existing = existing.rename_columns(renamed) - - tag_keys = self.input_stream.keys()[0] - - if existing is not None and existing.num_rows > 0: - # If there are existing entries, we can cache them - existing_stream = TableStream(existing, tag_columns=tag_keys) - for tag, packet in existing_stream.iter_packets(): - cached_results.append((tag, packet)) - - if missing is not None and missing.num_rows > 0: - hash_to_output_lut: dict[str, cp.Packet | None] = {} - for tag, packet in TableStream(missing, tag_columns=tag_keys): - # Since these packets are known to be missing, skip the cache lookup - packet_hash = packet.content_hash().to_string() - if packet_hash in hash_to_output_lut: - output_packet = hash_to_output_lut[packet_hash] - else: - tag, output_packet = self.pod.call( - tag, - packet, - skip_cache_lookup=True, - execution_engine=execution_engine, - ) - hash_to_output_lut[packet_hash] = output_packet - cached_results.append((tag, output_packet)) - - self._cached_output_packets = cached_results - self._set_modified_time() - - def iter_packets( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - """ - Processes the input stream and prepares the output stream. - This is typically called before iterating over the packets. - """ - if self._cached_output_packets is None: - cached_results = [] - - # identify all entries in the input stream for which we still have not computed packets - target_entries = self.input_stream.as_table( - include_system_tags=True, - include_source=True, - include_content_hash=constants.INPUT_PACKET_HASH, - execution_engine=execution_engine, - ) - existing_entries = self.pod.get_all_cached_outputs( - include_system_columns=True - ) - if existing_entries is None or existing_entries.num_rows == 0: - missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) - existing = None - else: - # missing = target_entries.join( - # existing_entries, - # keys=[constants.INPUT_PACKET_HASH], - # join_type="left anti", - # ) - # Single join that gives you both missing and existing - # More efficient - only bring the key column from existing_entries - # .select([constants.INPUT_PACKET_HASH]).append_column( - # "_exists", pa.array([True] * len(existing_entries)) - # ), - - # TODO: do more proper replacement operation - target_df = pl.DataFrame(target_entries) - existing_df = pl.DataFrame( - existing_entries.append_column( - "_exists", pa.array([True] * len(existing_entries)) - ) - ) - all_results_df = target_df.join( - existing_df, - on=constants.INPUT_PACKET_HASH, - how="left", - suffix="_right", - ) - all_results = all_results_df.to_arrow() - # all_results = target_entries.join( - # existing_entries.append_column( - # "_exists", pa.array([True] * len(existing_entries)) - # ), - # keys=[constants.INPUT_PACKET_HASH], - # join_type="left outer", - # right_suffix="_right", # rename the existing records in case of collision of output packet keys with input packet keys - # ) - # grab all columns from target_entries first - missing = ( - all_results.filter(pc.is_null(pc.field("_exists"))) - .select(target_entries.column_names) - .drop_columns([constants.INPUT_PACKET_HASH]) - ) - - existing = all_results.filter( - pc.is_valid(pc.field("_exists")) 
- ).drop_columns( - [ - "_exists", - constants.INPUT_PACKET_HASH, - constants.PACKET_RECORD_ID, - *self.input_stream.keys()[1], # remove the input packet keys - ] - # TODO: look into NOT fetching back the record ID - ) - renamed = [ - c.removesuffix("_right") if c.endswith("_right") else c - for c in existing.column_names - ] - existing = existing.rename_columns(renamed) - - tag_keys = self.input_stream.keys()[0] - - if existing is not None and existing.num_rows > 0: - # If there are existing entries, we can cache them - existing_stream = TableStream(existing, tag_columns=tag_keys) - for tag, packet in existing_stream.iter_packets(): - cached_results.append((tag, packet)) - yield tag, packet - - if missing is not None and missing.num_rows > 0: - hash_to_output_lut: dict[str, cp.Packet | None] = {} - for tag, packet in TableStream(missing, tag_columns=tag_keys): - # Since these packets are known to be missing, skip the cache lookup - packet_hash = packet.content_hash().to_string() - if packet_hash in hash_to_output_lut: - output_packet = hash_to_output_lut[packet_hash] - else: - tag, output_packet = self.pod.call( - tag, - packet, - skip_cache_lookup=True, - execution_engine=execution_engine, - ) - hash_to_output_lut[packet_hash] = output_packet - cached_results.append((tag, output_packet)) - if output_packet is not None: - yield tag, output_packet - - self._cached_output_packets = cached_results - self._set_modified_time() - else: - for tag, packet in self._cached_output_packets: - if packet is not None: - yield tag, packet - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - - tag_keys, _ = self.input_stream.keys(include_system_tags=include_system_tags) - packet_keys = tuple(self.pod.output_packet_types().keys()) - return tag_keys, packet_keys - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, _ = self.input_stream.types( - include_system_tags=include_system_tags - ) - # TODO: check if copying can be avoided - packet_typespec = dict(self.pod.output_packet_types()) - return tag_typespec, packet_typespec - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": - if self._cached_output_table is None: - all_tags = [] - all_packets = [] - tag_schema, packet_schema = None, None - for tag, packet in self.iter_packets(execution_engine=execution_engine): - if tag_schema is None: - tag_schema = tag.arrow_schema(include_system_tags=True) - if packet_schema is None: - packet_schema = packet.arrow_schema( - include_context=True, - include_source=True, - ) - all_tags.append(tag.as_dict(include_system_tags=True)) - # FIXME: using in the pinch conversion to str from path - # replace with an appropriate semantic converter-based approach! 
- dict_patcket = packet.as_dict(include_context=True, include_source=True) - all_packets.append(dict_patcket) - - converter = self.data_context.type_converter - - struct_packets = converter.python_dicts_to_struct_dicts(all_packets) - all_tags_as_tables: pa.Table = pa.Table.from_pylist( - all_tags, schema=tag_schema - ) - all_packets_as_tables: pa.Table = pa.Table.from_pylist( - struct_packets, schema=packet_schema - ) - - self._cached_output_table = arrow_utils.hstack_tables( - all_tags_as_tables, all_packets_as_tables - ) - assert self._cached_output_table is not None, ( - "_cached_output_table should not be None here." - ) - - drop_columns = [] - if not include_source: - drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) - if not include_data_context: - drop_columns.append(constants.CONTEXT_KEY) - if not include_system_tags: - # TODO: come up with a more efficient approach - drop_columns.extend( - [ - c - for c in self._cached_output_table.column_names - if c.startswith(constants.SYSTEM_TAG_PREFIX) - ] - ) - - output_table = self._cached_output_table.drop_columns(drop_columns) - - # lazily prepare content hash column if requested - if include_content_hash: - if self._cached_content_hash_column is None: - content_hashes = [] - for tag, packet in self.iter_packets(execution_engine=execution_engine): - content_hashes.append(packet.content_hash().to_string()) - self._cached_content_hash_column = pa.array( - content_hashes, type=pa.large_string() - ) - assert self._cached_content_hash_column is not None, ( - "_cached_content_hash_column should not be None here." - ) - hash_column_name = ( - "_content_hash" - if include_content_hash is True - else include_content_hash - ) - output_table = output_table.append_column( - hash_column_name, self._cached_content_hash_column - ) - - if sort_by_tags: - try: - # TODO: consider having explicit tag/packet properties? - output_table = output_table.sort_by( - [(column, "ascending") for column in self.keys()[0]] - ) - except pa.ArrowTypeError: - pass - - return output_table diff --git a/src/orcapod/core/streams/kernel_stream.py b/src/orcapod/core/streams/kernel_stream.py deleted file mode 100644 index c3850a5..0000000 --- a/src/orcapod/core/streams/kernel_stream.py +++ /dev/null @@ -1,199 +0,0 @@ -import logging -from collections.abc import Iterator -from datetime import datetime -from typing import TYPE_CHECKING, Any - -from orcapod.protocols import core_protocols as cp -from orcapod.types import PythonSchema -from orcapod.utils.lazy_module import LazyModule -from orcapod.core.streams.base import StreamBase - - -if TYPE_CHECKING: - import pyarrow as pa - import pyarrow.compute as pc - import polars as pl - import pandas as pd - import asyncio -else: - pa = LazyModule("pyarrow") - pc = LazyModule("pyarrow.compute") - pl = LazyModule("polars") - pd = LazyModule("pandas") - asyncio = LazyModule("asyncio") - - -# TODO: consider using this instead of making copy of dicts -# from types import MappingProxyType - -logger = logging.getLogger(__name__) - - -class KernelStream(StreamBase): - """ - Recomputable stream that wraps a stream produced by a kernel to provide - an abstraction over the stream, taking the stream's source and upstreams as the basis of - recomputing the stream. - - This stream is used to represent the output of a kernel invocation. - """ - - def __init__( - self, - output_stream: cp.Stream | None = None, - source: cp.Kernel | None = None, - upstreams: tuple[ - cp.Stream, ... 
- ] = (), # if provided, this will override the upstreams of the output_stream - **kwargs, - ) -> None: - if (output_stream is None or output_stream.source is None) and source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." - ) - if source is None: - if output_stream is None or output_stream.source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." - ) - source = output_stream.source - upstreams = upstreams or output_stream.upstreams - - super().__init__(source=source, upstreams=upstreams, **kwargs) - self.kernel = source - self._cached_stream = output_stream - - def clear_cache(self) -> None: - """ - Clears the cached stream. - This is useful for re-processing the stream with the same kernel. - """ - self._cached_stream = None - self._set_modified_time(invalidate=True) - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - tag_types, packet_types = self.kernel.output_types( - *self.upstreams, include_system_tags=include_system_tags - ) - return tuple(tag_types.keys()), tuple(packet_types.keys()) - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - """ - Returns the types of the tag and packet columns in the stream. - This is useful for accessing the types of the columns in the stream. - """ - return self.kernel.output_types( - *self.upstreams, include_system_tags=include_system_tags - ) - - @property - def is_current(self) -> bool: - if self._cached_stream is None or not super().is_current: - status = self.refresh() - if not status: # if it failed to update for whatever reason - return False - return True - - def refresh(self, force: bool = False) -> bool: - updated = False - if force or (self._cached_stream is not None and not super().is_current): - self.clear_cache() - - if self._cached_stream is None: - assert self.source is not None, ( - "Stream source must be set to recompute the stream." - ) - self._cached_stream = self.source.forward(*self.upstreams) - self._set_modified_time() - updated = True - - if self._cached_stream is None: - # TODO: use beter error type - raise ValueError( - "Stream could not be updated. Ensure that the source is valid and upstreams are correct." - ) - - return updated - - def invalidate(self) -> None: - """ - Invalidate the stream, marking it as needing recomputation. - This will clear the cached stream and set the last modified time to None. - """ - self.clear_cache() - self._set_modified_time(invalidate=True) - - @property - def last_modified(self) -> datetime | None: - if self._cached_stream is None: - return None - return self._cached_stream.last_modified - - def run( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any, - ) -> None: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - self._cached_stream.run(*args, execution_engine=execution_engine, **kwargs) - - async def run_async( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any, - ) -> None: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." 
- ) - await self._cached_stream.run_async( - *args, execution_engine=execution_engine, **kwargs - ) - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - return self._cached_stream.as_table( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, - ) - - def iter_packets( - self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - return self._cached_stream.iter_packets(execution_engine=execution_engine) - - def __repr__(self) -> str: - return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" diff --git a/src/orcapod/core/streams/lazy_pod_stream.py b/src/orcapod/core/streams/lazy_pod_stream.py deleted file mode 100644 index 9eefc83..0000000 --- a/src/orcapod/core/streams/lazy_pod_stream.py +++ /dev/null @@ -1,232 +0,0 @@ -import logging -from collections.abc import Iterator -from pathlib import Path -from typing import TYPE_CHECKING, Any - -from orcapod.core.system_constants import constants -from orcapod.protocols import core_protocols as cp -from orcapod.types import PythonSchema -from orcapod.utils import arrow_utils -from orcapod.utils.lazy_module import LazyModule -from orcapod.core.streams.base import StreamBase - - -if TYPE_CHECKING: - import pyarrow as pa - import polars as pl - import asyncio -else: - pa = LazyModule("pyarrow") - pl = LazyModule("polars") - asyncio = LazyModule("asyncio") - - -# TODO: consider using this instead of making copy of dicts -# from types import MappingProxyType - -logger = logging.getLogger(__name__) - - -class LazyPodResultStream(StreamBase): - """ - A fixed stream that lazily processes packets from a prepared input stream. - This is what Pod.process() returns - it's static/fixed but efficient. - """ - - def __init__(self, pod: cp.Pod, prepared_stream: cp.Stream, **kwargs): - super().__init__(source=pod, upstreams=(prepared_stream,), **kwargs) - self.pod = pod - self.prepared_stream = prepared_stream - # capture the immutable iterator from the prepared stream - self._prepared_stream_iterator = prepared_stream.iter_packets() - self._set_modified_time() # set modified time to AFTER we obtain the iterator - # note that the invocation of iter_packets on upstream likely triggeres the modified time - # to be updated on the usptream. Hence you want to set this stream's modified time after that. 
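# A standalone sketch of the process-once, cache-by-index pattern used by the
# LazyPodResultStream being removed here (its packet cache is set up just below):
# each upstream item is processed at most once, results (including None for filtered
# items) are remembered by position, and later iterations replay from the cache.
# LazyCachedResults, `items`, and `process` are illustrative stand-ins, not orcapod APIs.
from collections.abc import Callable, Iterator


class LazyCachedResults:
    def __init__(self, items: list[int], process: Callable[[int], int | None]) -> None:
        self._items = items
        self._process = process
        self._cache: dict[int, int | None] = {}  # index -> processed result (None = filtered out)
        self._exhausted = False                  # True once every item has been processed

    def __iter__(self) -> Iterator[int]:
        if not self._exhausted:
            for i, item in enumerate(self._items):
                if i not in self._cache:
                    self._cache[i] = self._process(item)  # compute once, remember forever
                result = self._cache[i]
                if result is not None:
                    yield result
            self._exhausted = True
        else:
            # Replay purely from the cache; no reprocessing happens on later passes.
            for i in range(len(self._items)):
                result = self._cache[i]
                if result is not None:
                    yield result


results = LazyCachedResults([1, 2, 3, 4], lambda x: x * x if x % 2 == 0 else None)
assert list(results) == [4, 16]  # first pass processes and caches
assert list(results) == [4, 16]  # second pass replays the cache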
- - # Packet-level caching (from your PodStream) - self._cached_output_packets: dict[int, tuple[cp.Tag, cp.Packet | None]] = {} - self._cached_output_table: pa.Table | None = None - self._cached_content_hash_column: pa.Array | None = None - - def iter_packets( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - if self._prepared_stream_iterator is not None: - for i, (tag, packet) in enumerate(self._prepared_stream_iterator): - if i in self._cached_output_packets: - # Use cached result - tag, packet = self._cached_output_packets[i] - if packet is not None: - yield tag, packet - else: - # Process packet - processed = self.pod.call( - tag, packet, execution_engine=execution_engine - ) - if processed is not None: - # Update shared cache for future iterators (optimization) - self._cached_output_packets[i] = processed - tag, packet = processed - if packet is not None: - yield tag, packet - - # Mark completion by releasing the iterator - self._prepared_stream_iterator = None - else: - # Yield from snapshot of complete cache - for i in range(len(self._cached_output_packets)): - tag, packet = self._cached_output_packets[i] - if packet is not None: - yield tag, packet - - async def run_async( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any, - ) -> None: - if self._prepared_stream_iterator is not None: - pending_call_lut = {} - for i, (tag, packet) in enumerate(self._prepared_stream_iterator): - if i not in self._cached_output_packets: - # Process packet - pending_call_lut[i] = self.pod.async_call( - tag, packet, execution_engine=execution_engine - ) - - indices = list(pending_call_lut.keys()) - pending_calls = [pending_call_lut[i] for i in indices] - - results = await asyncio.gather(*pending_calls) - for i, result in zip(indices, results): - self._cached_output_packets[i] = result - - # Mark completion by releasing the iterator - self._prepared_stream_iterator = None - - def run( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any - ) -> None: - # Fallback to synchronous run - self.flow(execution_engine=execution_engine) - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. 
- """ - - tag_keys, _ = self.prepared_stream.keys(include_system_tags=include_system_tags) - packet_keys = tuple(self.pod.output_packet_types().keys()) - return tag_keys, packet_keys - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, _ = self.prepared_stream.types( - include_system_tags=include_system_tags - ) - # TODO: check if copying can be avoided - packet_typespec = dict(self.pod.output_packet_types()) - return tag_typespec, packet_typespec - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": - if self._cached_output_table is None: - all_tags = [] - all_packets = [] - tag_schema, packet_schema = None, None - for tag, packet in self.iter_packets(execution_engine=execution_engine): - if tag_schema is None: - tag_schema = tag.arrow_schema(include_system_tags=True) - if packet_schema is None: - packet_schema = packet.arrow_schema( - include_context=True, - include_source=True, - ) - all_tags.append(tag.as_dict(include_system_tags=True)) - # FIXME: using in the pinch conversion to str from path - # replace with an appropriate semantic converter-based approach! - dict_patcket = packet.as_dict(include_context=True, include_source=True) - all_packets.append(dict_patcket) - - # TODO: re-verify the implemetation of this conversion - converter = self.data_context.type_converter - - struct_packets = converter.python_dicts_to_struct_dicts(all_packets) - all_tags_as_tables: pa.Table = pa.Table.from_pylist( - all_tags, schema=tag_schema - ) - all_packets_as_tables: pa.Table = pa.Table.from_pylist( - struct_packets, schema=packet_schema - ) - - self._cached_output_table = arrow_utils.hstack_tables( - all_tags_as_tables, all_packets_as_tables - ) - assert self._cached_output_table is not None, ( - "_cached_output_table should not be None here." - ) - - drop_columns = [] - if not include_system_tags: - # TODO: get system tags more effiicently - drop_columns.extend( - [ - c - for c in self._cached_output_table.column_names - if c.startswith(constants.SYSTEM_TAG_PREFIX) - ] - ) - if not include_source: - drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) - if not include_data_context: - drop_columns.append(constants.CONTEXT_KEY) - - output_table = self._cached_output_table.drop(drop_columns) - - # lazily prepare content hash column if requested - if include_content_hash: - if self._cached_content_hash_column is None: - content_hashes = [] - # TODO: verify that order will be preserved - for tag, packet in self.iter_packets(): - content_hashes.append(packet.content_hash().to_string()) - self._cached_content_hash_column = pa.array( - content_hashes, type=pa.large_string() - ) - assert self._cached_content_hash_column is not None, ( - "_cached_content_hash_column should not be None here." 
- ) - hash_column_name = ( - "_content_hash" - if include_content_hash is True - else include_content_hash - ) - output_table = output_table.append_column( - hash_column_name, self._cached_content_hash_column - ) - - if sort_by_tags: - # TODO: reimplement using polars natively - output_table = ( - pl.DataFrame(output_table) - .sort(by=self.keys()[0], descending=False) - .to_arrow() - ) - # output_table = output_table.sort_by( - # [(column, "ascending") for column in self.keys()[0]] - # ) - return output_table diff --git a/src/orcapod/core/streams/pod_node_stream.py b/src/orcapod/core/streams/pod_node_stream.py index b6ef449..affe4b7 100644 --- a/src/orcapod/core/streams/pod_node_stream.py +++ b/src/orcapod/core/streams/pod_node_stream.py @@ -2,19 +2,20 @@ from collections.abc import Iterator from typing import TYPE_CHECKING, Any -from orcapod.core.system_constants import constants -from orcapod.protocols import core_protocols as cp, pipeline_protocols as pp +import orcapod.protocols.core_protocols.execution_engine +from orcapod.contexts.system_constants import constants +from orcapod.core.streams.base import StreamBase +from orcapod.core.streams.table_stream import TableStream +from orcapod.protocols import core_protocols as cp +from orcapod.protocols import pipeline_protocols as pp from orcapod.types import PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.core.streams.base import StreamBase -from orcapod.core.streams.table_stream import TableStream - if TYPE_CHECKING: + import polars as pl import pyarrow as pa import pyarrow.compute as pc - import polars as pl else: pa = LazyModule("pyarrow") @@ -56,7 +57,9 @@ def mode(self) -> str: return self.pod_node.mode async def run_async( - self, execution_engine: cp.ExecutionEngine | None = None + self, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, ) -> None: """ Runs the stream, processing the input stream and preparing the output stream. @@ -135,7 +138,8 @@ async def run_async( def run( self, *args: Any, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, **kwargs: Any, ) -> None: cached_results = [] @@ -254,7 +258,9 @@ def clear_cache(self) -> None: self._cached_content_hash_column = None def iter_packets( - self, execution_engine: cp.ExecutionEngine | None = None + self, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, ) -> Iterator[tuple[cp.Tag, cp.Packet]]: """ Processes the input stream and prepares the output stream. 
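# A standalone sketch of how the run()/run_async() signatures in the surrounding hunks
# are typically driven: the caller may pass an object with the submit_sync()/submit_async()
# shape of the ExecutionEngine protocol (see the base.py removal further down), or pass
# None to fall back to inline execution. InlineEngine and DemoStream are illustrative
# stand-ins, not orcapod classes.
import asyncio
from collections.abc import Callable
from typing import Any


class InlineEngine:
    name = "inline"

    def submit_sync(self, function: Callable, *args: Any, **kwargs: Any) -> Any:
        return function(*args, **kwargs)

    async def submit_async(self, function: Callable, *args: Any, **kwargs: Any) -> Any:
        # Off-load the call to a worker thread so the event loop stays responsive.
        return await asyncio.to_thread(function, *args, **kwargs)


class DemoStream:
    def __init__(self, values: list[int]) -> None:
        self._values = values
        self.results: list[int] = []

    def run(self, execution_engine: InlineEngine | None = None) -> None:
        engine = execution_engine or InlineEngine()
        self.results = [engine.submit_sync(lambda v=v: v + 1) for v in self._values]

    async def run_async(self, execution_engine: InlineEngine | None = None) -> None:
        engine = execution_engine or InlineEngine()
        calls = [engine.submit_async(lambda v=v: v + 1) for v in self._values]
        self.results = list(await asyncio.gather(*calls))


stream = DemoStream([1, 2, 3])
stream.run()                     # synchronous path
assert stream.results == [2, 3, 4]
asyncio.run(stream.run_async())  # asynchronous path gathers the submitted calls
assert stream.results == [2, 3, 4]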
@@ -421,7 +427,8 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, ) -> "pa.Table": if self._cached_output_table is None: all_tags = [] diff --git a/src/orcapod/core/streams/table_stream.py b/src/orcapod/core/streams/table_stream.py index a71ea5f..1581ec5 100644 --- a/src/orcapod/core/streams/table_stream.py +++ b/src/orcapod/core/streams/table_stream.py @@ -10,27 +10,26 @@ DictTag, ) from orcapod.core.system_constants import constants -from orcapod.protocols import core_protocols as cp +from orcapod.protocols.core_protocols import Pod, Tag, Packet, Stream, ColumnConfig + from orcapod.types import PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.core.streams.base import ImmutableStream +from orcapod.core.streams.base import StreamBase if TYPE_CHECKING: import pyarrow as pa - import pyarrow.compute as pc import polars as pl import pandas as pd else: pa = LazyModule("pyarrow") - pc = LazyModule("pyarrow.compute") pl = LazyModule("polars") pd = LazyModule("pandas") logger = logging.getLogger(__name__) -class TableStream(ImmutableStream): +class TableStream(StreamBase): """ An immutable stream based on a PyArrow Table. This stream is designed to be used with data that is already in a tabular format, @@ -48,11 +47,14 @@ def __init__( tag_columns: Collection[str] = (), system_tag_columns: Collection[str] = (), source_info: dict[str, str | None] | None = None, - source: cp.Kernel | None = None, - upstreams: tuple[cp.Stream, ...] = (), + source: Pod | None = None, + upstreams: tuple[Stream, ...] = (), **kwargs, ) -> None: - super().__init__(source=source, upstreams=upstreams, **kwargs) + super().__init__(**kwargs) + + self._source = source + self._upstreams = upstreams data_table, data_context_table = arrow_utils.split_by_column_groups( table, [constants.CONTEXT_KEY] @@ -143,47 +145,67 @@ def __init__( # ) # ) - self._cached_elements: list[tuple[cp.Tag, ArrowPacket]] | None = None - self._set_modified_time() # set modified time to now + self._cached_elements: list[tuple[Tag, ArrowPacket]] | None = None + self._update_modified_time() # set modified time to now - def data_content_identity_structure(self) -> Any: + def identity_structure(self) -> Any: """ Returns a hash of the content of the stream. This is used to identify the content of the stream. """ - table_hash = self.data_context.arrow_hasher.hash_table( - self.as_table( - include_data_context=True, include_source=True, include_system_tags=True - ), - ) - return ( - self.__class__.__name__, - table_hash, - self._tag_columns, - ) + if self.source is None: + table_hash = self.data_context.arrow_hasher.hash_table( + self.as_table( + all_info=True, + ), + ) + return ( + self.__class__.__name__, + table_hash, + self._tag_columns, + ) + return super().identity_structure() + + @property + def source(self) -> Pod | None: + return self._source + + @property + def upstreams(self) -> tuple[Stream, ...]: + return self._upstreams def keys( - self, include_system_tags: bool = False + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. 
""" tag_columns = self._tag_columns - if include_system_tags: + columns_config = ColumnConfig.handle_config(columns, all_info=all_info) + # TODO: add standard parsing of columns + if columns_config.system_tags: tag_columns += self._system_tag_columns return tag_columns, self._packet_columns - def types( - self, include_system_tags: bool = False + def output_schema( + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. """ + # normalize column config + columns_config = ColumnConfig.handle_config(columns, all_info=all_info) # TODO: consider using MappingProxyType to avoid copying the dicts converter = self.data_context.type_converter - if include_system_tags: + if columns_config.system_tags: tag_schema = self._all_tag_schema else: tag_schema = self._tag_schema @@ -194,23 +216,21 @@ def types( def as_table( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": """ Returns the underlying table representation of the stream. This is useful for converting the stream to a table format. """ + columns_config = ColumnConfig.handle_config(columns, all_info=all_info) output_table = self._table - if include_content_hash: + if columns_config.content_hash: hash_column_name = ( "_content_hash" - if include_content_hash is True - else include_content_hash + if columns_config.content_hash is True + else columns_config.content_hash ) content_hashes = [ str(packet.content_hash()) for _, packet in self.iter_packets() @@ -218,22 +238,24 @@ def as_table( output_table = output_table.append_column( hash_column_name, pa.array(content_hashes, type=pa.large_string()) ) - if not include_system_tags: + if not columns_config.system_tags: # Check in original implementation output_table = output_table.drop_columns(list(self._system_tag_columns)) table_stack = (output_table,) - if include_data_context: + if columns_config.context: table_stack += (self._data_context_table,) - if include_source: + if columns_config.source: table_stack += (self._source_info_table,) table = arrow_utils.hstack_tables(*table_stack) - if sort_by_tags: + if columns_config.sort_by_tags: # TODO: cleanup the sorting tag selection logic try: target_tags = ( - self._all_tag_columns if include_system_tags else self._tag_columns + self._all_tag_columns + if columns_config.system_tags + else self._tag_columns ) return table.sort_by([(column, "ascending") for column in target_tags]) except pa.ArrowTypeError: @@ -249,9 +271,7 @@ def clear_cache(self) -> None: """ self._cached_elements = None - def iter_packets( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> Iterator[tuple[cp.Tag, ArrowPacket]]: + def iter_packets(self) -> Iterator[tuple[Tag, ArrowPacket]]: """ Iterates over the packets in the stream. Each packet is represented as a tuple of (Tag, Packet). @@ -294,32 +314,6 @@ def iter_packets( else: yield from self._cached_elements - def run( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any, - ) -> None: - """ - Runs the stream, which in this case is a no-op since the stream is immutable. 
- This is typically used to trigger any upstream computation of the stream. - """ - # No-op for immutable streams - pass - - async def run_async( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs: Any, - ) -> None: - """ - Runs the stream asynchronously, which in this case is a no-op since the stream is immutable. - This is typically used to trigger any upstream computation of the stream. - """ - # No-op for immutable streams - pass - def __repr__(self) -> str: return ( f"{self.__class__.__name__}(table={self._table.column_names}, " diff --git a/src/orcapod/core/streams/wrapped_stream.py b/src/orcapod/core/streams/wrapped_stream.py deleted file mode 100644 index 3f14203..0000000 --- a/src/orcapod/core/streams/wrapped_stream.py +++ /dev/null @@ -1,86 +0,0 @@ -import logging -from collections.abc import Iterator -from typing import TYPE_CHECKING, Any - -from orcapod.protocols import core_protocols as cp -from orcapod.types import PythonSchema -from orcapod.utils.lazy_module import LazyModule -from orcapod.core.streams.base import StreamBase - - -if TYPE_CHECKING: - import pyarrow as pa -else: - pa = LazyModule("pyarrow") - - -# TODO: consider using this instead of making copy of dicts -# from types import MappingProxyType - -logger = logging.getLogger(__name__) - - -class WrappedStream(StreamBase): - def __init__( - self, - stream: cp.Stream, - source: cp.Kernel, - input_streams: tuple[cp.Stream, ...], - label: str | None = None, - **kwargs, - ) -> None: - super().__init__(source=source, upstreams=input_streams, label=label, **kwargs) - self._stream = stream - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - return self._stream.keys(include_system_tags=include_system_tags) - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - """ - Returns the types of the tag and packet columns in the stream. - This is useful for accessing the types of the columns in the stream. - """ - return self._stream.types(include_system_tags=include_system_tags) - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": - """ - Returns the underlying table representation of the stream. - This is useful for converting the stream to a table format. - """ - return self._stream.as_table( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, - ) - - def iter_packets( - self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - """ - Iterates over the packets in the stream. - Each packet is represented as a tuple of (Tag, Packet). 
- """ - return self._stream.iter_packets(execution_engine=execution_engine) - - def identity_structure(self) -> Any: - return self._stream.identity_structure() diff --git a/src/orcapod/core/trackers.py b/src/orcapod/core/tracker.py similarity index 89% rename from src/orcapod/core/trackers.py rename to src/orcapod/core/tracker.py index 4ffe39a..2a78ae7 100644 --- a/src/orcapod/core/trackers.py +++ b/src/orcapod/core/tracker.py @@ -1,11 +1,11 @@ -from orcapod.core.base import LabeledContentIdentifiableBase -from orcapod.protocols import core_protocols as cp +from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import Generator -from abc import ABC, abstractmethod -from typing import Any, TYPE_CHECKING from contextlib import contextmanager +from typing import TYPE_CHECKING, Any +from orcapod.core.base import OrcapodBase +from orcapod.protocols import core_protocols as cp if TYPE_CHECKING: import networkx as nx @@ -50,38 +50,43 @@ def get_active_trackers(self) -> list[cp.Tracker]: # This is to ensure that we only return trackers that are currently active return [t for t in self._active_trackers if t.is_active()] - def record_kernel_invocation( + def record_pod_invocation( self, - kernel: cp.Kernel, + pod: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> None: """ - Record the output stream of a kernel invocation in the tracker. - This is used to track the computational graph and the invocations of kernels. + Record the output stream of a pod invocation in the tracker. + This is used to track the computational graph and the invocations of pods. """ for tracker in self.get_active_trackers(): - tracker.record_kernel_invocation(kernel, upstreams, label=label) + tracker.record_pod_invocation(pod, upstreams, label=label) - def record_source_invocation( - self, source: cp.Source, label: str | None = None + def record_source_pod_invocation( + self, source_pod: cp.SourcePod, label: str | None = None ) -> None: """ Record the output stream of a source invocation in the tracker. This is used to track the computational graph and the invocations of sources. """ for tracker in self.get_active_trackers(): - tracker.record_source_invocation(source, label=label) + tracker.record_source_pod_invocation(source_pod, label=label) - def record_pod_invocation( - self, pod: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None + def record_packet_function_invocation( + self, + packet_function: cp.PacketFunction, + input_stream: cp.Stream, + label: str | None = None, ) -> None: """ Record the output stream of a pod invocation in the tracker. This is used to track the computational graph and the invocations of pods. """ for tracker in self.get_active_trackers(): - tracker.record_pod_invocation(pod, upstreams, label=label) + tracker.record_packet_function_invocation( + packet_function, input_stream, label=label + ) @contextmanager def no_tracking(self) -> Generator[None, Any, None]: @@ -111,14 +116,14 @@ def is_active(self) -> bool: @abstractmethod def record_kernel_invocation( self, - kernel: cp.Kernel, + kernel: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> None: ... @abstractmethod def record_source_invocation( - self, source: cp.Source, label: str | None = None + self, source: cp.SourcePod, label: str | None = None ) -> None: ... 
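# A standalone sketch of the tracker pattern used in this file: a tracker is toggled with
# set_active()/is_active(), usable as a context manager, and a manager fans
# record_pod_invocation() out to every currently active tracker. RecordingTracker and
# SimpleTrackerManager are illustrative stand-ins, not the orcapod classes themselves.
from typing import Any


class RecordingTracker:
    def __init__(self) -> None:
        self._active = False
        self.records: list[tuple[Any, tuple, str | None]] = []

    def set_active(self, active: bool = True) -> None:
        self._active = active

    def is_active(self) -> bool:
        return self._active

    def record_pod_invocation(self, pod: Any, upstreams: tuple, label: str | None = None) -> None:
        self.records.append((pod, upstreams, label))

    def __enter__(self) -> "RecordingTracker":
        self.set_active(True)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.set_active(False)


class SimpleTrackerManager:
    def __init__(self) -> None:
        self._trackers: list[RecordingTracker] = []

    def register(self, tracker: RecordingTracker) -> None:
        self._trackers.append(tracker)

    def record_pod_invocation(self, pod: Any, upstreams: tuple, label: str | None = None) -> None:
        # Only trackers that are currently active receive the record.
        for tracker in self._trackers:
            if tracker.is_active():
                tracker.record_pod_invocation(pod, upstreams, label=label)


manager = SimpleTrackerManager()
tracker = RecordingTracker()
manager.register(tracker)
with tracker:                                  # active only inside the block
    manager.record_pod_invocation("my_pod", (), label="first")
manager.record_pod_invocation("my_pod", ())    # ignored: tracker no longer active
assert len(tracker.records) == 1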
@abstractmethod @@ -134,10 +139,10 @@ def __exit__(self, exc_type, exc_val, ext_tb): self.set_active(False) -class Invocation(LabeledContentIdentifiableBase): +class Invocation(OrcapodBase): def __init__( self, - kernel: cp.Kernel, + kernel: cp.Pod, upstreams: tuple[cp.Stream, ...] = (), label: str | None = None, ) -> None: @@ -204,11 +209,11 @@ def __init__( # This is used to track the computational graph and the invocations of kernels self.kernel_invocations: set[Invocation] = set() self.invocation_to_pod_lut: dict[Invocation, cp.Pod] = {} - self.invocation_to_source_lut: dict[Invocation, cp.Source] = {} + self.invocation_to_source_lut: dict[Invocation, cp.SourcePod] = {} def _record_kernel_and_get_invocation( self, - kernel: cp.Kernel, + kernel: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> Invocation: @@ -218,7 +223,7 @@ def _record_kernel_and_get_invocation( def record_kernel_invocation( self, - kernel: cp.Kernel, + kernel: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> None: @@ -229,7 +234,7 @@ def record_kernel_invocation( self._record_kernel_and_get_invocation(kernel, upstreams, label) def record_source_invocation( - self, source: cp.Source, label: str | None = None + self, source: cp.SourcePod, label: str | None = None ) -> None: """ Record the output stream of a source invocation in the tracker. @@ -246,7 +251,7 @@ def record_pod_invocation( invocation = self._record_kernel_and_get_invocation(pod, upstreams, label) self.invocation_to_pod_lut[invocation] = pod - def reset(self) -> dict[cp.Kernel, list[cp.Stream]]: + def reset(self) -> dict[cp.Pod, list[cp.Stream]]: """ Reset the tracker and return the recorded invocations. """ diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 8576f83..71e71a2 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -1,14 +1,15 @@ import hashlib +import json +from collections.abc import Callable from typing import Any + import pyarrow as pa -import json -from orcapod.semantic_types import SemanticTypeRegistry + from orcapod.hashing import arrow_serialization -from collections.abc import Callable from orcapod.hashing.visitors import SemanticHashingVisitor -from orcapod.utils import arrow_utils from orcapod.protocols.hashing_protocols import ContentHash - +from orcapod.semantic_types import SemanticTypeRegistry +from orcapod.utils import arrow_utils SERIALIZATION_METHOD_LUT: dict[str, Callable[[pa.Table], bytes]] = { "logical": arrow_serialization.serialize_table_logical, diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index ddb7422..26eb4ac 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -1,5 +1,6 @@ -from orcapod.core.trackers import GraphTracker, Invocation +from orcapod.core.tracker import GraphTracker, Invocation from orcapod.pipeline.nodes import KernelNode, PodNode +import orcapod.protocols.core_protocols.execution_engine from orcapod.protocols.pipeline_protocols import Node from orcapod import contexts from orcapod.protocols import core_protocols as cp @@ -178,7 +179,8 @@ def set_mode(self, mode: str) -> None: def run( self, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, run_async: bool | None = None, ) -> None: """Execute the pipeline by running all nodes in the graph. 
diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 08cd2ed..f05cdea 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,13 +1,14 @@ from abc import abstractmethod from orcapod.core.datagrams import ArrowTag -from orcapod.core.kernels import KernelStream, WrappedKernel +from orcapod.core.pod import KernelStream, WrappedKernel from orcapod.core.sources.base import SourceBase, InvocationBase -from orcapod.core.pods import CachedPod +from orcapod.core.packet_function import CachedPod from orcapod.protocols import core_protocols as cp, database_protocols as dbp +import orcapod.protocols.core_protocols.execution_engine from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule from typing import TYPE_CHECKING, Any -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants from orcapod.utils import arrow_utils from collections.abc import Collection from orcapod.core.streams import PodNodeStream @@ -301,7 +302,8 @@ def call( tag: cp.Tag, packet: cp.Packet, record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[cp.Tag, cp.Packet | None]: @@ -338,7 +340,8 @@ async def async_call( tag: cp.Tag, packet: cp.Packet, record_id: str | None = None, - execution_engine: cp.ExecutionEngine | None = None, + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine + | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[cp.Tag, cp.Packet | None]: diff --git a/src/orcapod/protocols/core_protocols/__init__.py b/src/orcapod/protocols/core_protocols/__init__.py index f9c711d..62e9b0c 100644 --- a/src/orcapod/protocols/core_protocols/__init__.py +++ b/src/orcapod/protocols/core_protocols/__init__.py @@ -1,24 +1,22 @@ -from .base import ExecutionEngine, PodFunction -from .datagrams import Datagram, Tag, Packet -from .streams import Stream, LiveStream -from .kernel import Kernel -from .pods import Pod, CachedPod -from .source import Source +from .datagrams import ColumnConfig, Datagram, Packet, Tag +from .operator_pod import OperatorPod +from .packet_function import PacketFunction +from .pod import ArgumentGroup, Pod +from .source_pod import SourcePod +from .streams import Stream from .trackers import Tracker, TrackerManager - __all__ = [ - "ExecutionEngine", - "PodFunction", + "ColumnConfig", "Datagram", "Tag", "Packet", "Stream", - "LiveStream", - "Kernel", "Pod", - "CachedPod", - "Source", + "ArgumentGroup", + "SourcePod", + "OperatorPod", + "PacketFunction", "Tracker", "TrackerManager", ] diff --git a/src/orcapod/protocols/core_protocols/base.py b/src/orcapod/protocols/core_protocols/base.py deleted file mode 100644 index c44d52c..0000000 --- a/src/orcapod/protocols/core_protocols/base.py +++ /dev/null @@ -1,110 +0,0 @@ -from collections.abc import Callable -from typing import Any, Protocol, runtime_checkable -from orcapod.types import DataValue - - -@runtime_checkable -class ExecutionEngine(Protocol): - @property - def name(self) -> str: ... - - def submit_sync(self, function: Callable, *args, **kwargs) -> Any: - """ - Run the given function with the provided arguments. - This method should be implemented by the execution engine. - """ - ... 
- - async def submit_async(self, function: Callable, *args, **kwargs) -> Any: - """ - Asynchronously run the given function with the provided arguments. - This method should be implemented by the execution engine. - """ - ... - - # TODO: consider adding batch submission - - -@runtime_checkable -class PodFunction(Protocol): - """ - A function suitable for use in a FunctionPod. - - PodFunctions define the computational logic that operates on individual - packets within a Pod. They represent pure functions that transform - data values without side effects. - - These functions are designed to be: - - Stateless: No dependency on external state - - Deterministic: Same inputs always produce same outputs - - Serializable: Can be cached and distributed - - Type-safe: Clear input/output contracts - - PodFunctions accept named arguments corresponding to packet fields - and return transformed data values. - """ - - def __call__(self, **kwargs: DataValue) -> None | DataValue: - """ - Execute the pod function with the given arguments. - - The function receives packet data as named arguments and returns - either transformed data or None (for filtering operations). - - Args: - **kwargs: Named arguments mapping packet fields to data values - - Returns: - None: Filter out this packet (don't include in output) - DataValue: Single transformed value - - Raises: - TypeError: If required arguments are missing - ValueError: If argument values are invalid - """ - ... - - -@runtime_checkable -class Labelable(Protocol): - """ - Protocol for objects that can have a human-readable label. - - Labels provide meaningful names for objects in the computational graph, - making debugging, visualization, and monitoring much easier. They serve - as human-friendly identifiers that complement the technical identifiers - used internally. - - Labels are optional but highly recommended for: - - Debugging complex computational graphs - - Visualization and monitoring tools - - Error messages and logging - - User interfaces and dashboards - """ - - @property - def label(self) -> str: - """ - Return the human-readable label for this object. - - Labels should be descriptive and help users understand the purpose - or role of the object in the computational graph. - - Returns: - str: Human-readable label for this object - None: No label is set (will use default naming) - """ - ... - - @label.setter - def label(self, label: str | None) -> None: - """ - Set the human-readable label for this object. - - Labels should be descriptive and help users understand the purpose - or role of the object in the computational graph. - - Args: - value (str): Human-readable label for this object - """ - ... 
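# A standalone sketch of a function satisfying the PodFunction protocol removed from
# base.py above: it accepts packet fields as keyword arguments, is pure and deterministic,
# and returns either a transformed value or None to filter the packet out.
# normalize_score is an illustrative example, not an orcapod-provided function.
def normalize_score(**kwargs) -> float | None:
    score = kwargs.get("score")
    max_score = kwargs.get("max_score", 100)
    if score is None or max_score in (None, 0):
        return None  # None filters this packet out of the output stream
    return float(score) / float(max_score)


assert normalize_score(score=80, max_score=100) == 0.8
assert normalize_score(score=None) is None  # filtered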
diff --git a/src/orcapod/protocols/core_protocols/datagrams.py b/src/orcapod/protocols/core_protocols/datagrams.py index a0f24d8..de80d1d 100644 --- a/src/orcapod/protocols/core_protocols/datagrams.py +++ b/src/orcapod/protocols/core_protocols/datagrams.py @@ -1,13 +1,155 @@ from collections.abc import Collection, Iterator, Mapping -from typing import Any, Protocol, Self, TYPE_CHECKING, runtime_checkable -from orcapod.protocols.hashing_protocols import ContentIdentifiable -from orcapod.types import DataValue, PythonSchema +from dataclasses import dataclass +from typing import ( + TYPE_CHECKING, + Any, + Protocol, + Self, + TypeAlias, + runtime_checkable, +) +from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.types import DataType, DataValue, PythonSchema if TYPE_CHECKING: import pyarrow as pa +class Schema(Mapping[str, DataType]): + """ + Abstract base class for schema representations in Orcapod. + + Provides methods to access schema information in various formats, + including Python type specifications and PyArrow schemas. + """ + + @classmethod + def from_arrow_schema(cls, arrow_schema: "pa.Schema") -> Self: + """ + Create Schema instance from PyArrow schema. + + Args: + arrow_schema: PyArrow Schema to convert. + """ + ... + + def to_arrow_schema(self) -> "pa.Schema": + """ + Return PyArrow schema representation. + + The schema provides structured field and type information for efficient + serialization and deserialization with PyArrow. + + Returns: + PyArrow Schema describing the structure. + + Example: + >>> schema = schema.arrow_schema() + >>> schema.names + ['user_id', 'name'] + """ + ... + + +SchemaLike: TypeAlias = Mapping[str, DataType] + + +@dataclass(frozen=True) +class ColumnConfig: + """ + Configuration for column inclusion in Datagram/Packet/Tag operations. + + Controls which column types to include when converting to tables, dicts, + or querying keys/types. + + Attributes: + meta: Include meta columns (with '__' prefix). 
+ - False: exclude all meta columns (default) + - True: include all meta columns + - Collection[str]: include specific meta columns by name + (prefix '__' is added automatically if not present) + context: Include context column + source: Include source info columns (Packet only, ignored for others) + system_tags: Include system tag columns (Tag only, ignored for others) + all_info: Include all available columns (overrides other settings) + + Examples: + >>> # Data columns only (default) + >>> ColumnConfig() + + >>> # Everything + >>> ColumnConfig(all_info=True) + >>> # Or use convenience method: + >>> ColumnConfig.all() + + >>> # Specific combinations + >>> ColumnConfig(meta=True, context=True) + >>> ColumnConfig(meta=["pipeline", "processed"], source=True) + + >>> # As dict (alternative syntax) + >>> {"meta": True, "source": True} + """ + + meta: bool | Collection[str] = False + context: bool = False + source: bool = False # Only relevant for Packet + system_tags: bool = False # Only relevant for Tag + content_hash: bool | str = False # Only relevant for Packet + sort_by_tags: bool = False # Only relevant for Tag + all_info: bool = False + + @classmethod + def all(cls) -> Self: + """Convenience: include all available columns""" + return cls( + meta=True, + context=True, + source=True, + system_tags=True, + content_hash=True, + sort_by_tags=True, + all_info=True, + ) + + @classmethod + def data_only(cls) -> Self: + """Convenience: include only data columns (default)""" + return cls() + + @classmethod + def handle_config( + cls, config: Self | dict[str, Any] | None, all_info: bool = False + ) -> Self: + """ + Normalize column configuration input. + + Args: + config: ColumnConfig instance or dict to normalize. + all_info: If True, override config to include all columns. + + Returns: + Normalized ColumnConfig instance. + """ + if all_info: + return cls.all() + # TODO: properly handle non-boolean values when using all_info + + if config is None: + column_config = cls() + elif isinstance(config, dict): + column_config = cls(**config) + elif isinstance(config, Self): + column_config = config + else: + raise TypeError( + f"Invalid column config type: {type(config)}. " + "Expected ColumnConfig instance or dict." + ) + + return column_config + + @runtime_checkable class Datagram(ContentIdentifiable, Protocol): """ @@ -139,9 +281,9 @@ def get(self, key: str, default: DataValue = None) -> DataValue: # 3. Structural Information def keys( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[str, ...]: """ Return tuple of column names. @@ -172,11 +314,11 @@ def keys( """ ... - def types( + def schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> PythonSchema: """ Return type specification mapping field names to Python types. @@ -202,9 +344,9 @@ def types( def arrow_schema( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Schema": """ Return PyArrow schema representation. @@ -233,9 +375,9 @@ def arrow_schema( # 4. 
Format Conversions (Export) def as_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, DataValue]: """ Convert datagram to dictionary format. @@ -267,9 +409,9 @@ def as_dict( def as_table( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pa.Table": """ Convert datagram to PyArrow Table format. @@ -301,9 +443,9 @@ def as_table( def as_arrow_compatible_dict( self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> dict[str, Any]: """ Return dictionary with values optimized for Arrow table conversion. @@ -612,214 +754,6 @@ class Tag(Datagram, Protocol): - Quality indicators or confidence scores """ - def keys( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> tuple[str, ...]: - """ - Return tuple of column names. - - Provides access to column names with filtering options for different - column types. Default returns only data column names. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Return only data column names (default) - - True: Include all meta column names - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - include_source: Whether to include source info fields. - - - Returns: - Tuple of column names based on inclusion criteria. - - Example: - >>> datagram.keys() # Data columns only - ('user_id', 'name', 'email') - >>> datagram.keys(include_meta_columns=True) - ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_meta_columns=["pipeline"]) - ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_context=True) - ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') - """ - ... - - def types( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> PythonSchema: - """ - Return type specification mapping field names to Python types. - - The TypeSpec enables type checking and validation throughout the system. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column type inclusion. - - False: Exclude meta column types (default) - - True: Include all meta column types - - Collection[str]: Include meta column types matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context type. - include_source: Whether to include source info fields. - - Returns: - TypeSpec mapping field names to their Python types. 
- - Example: - >>> datagram.types() - {'user_id': , 'name': } - """ - ... - - def arrow_schema( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> "pa.Schema": - """ - Return PyArrow schema representation. - - The schema provides structured field and type information for efficient - serialization and deserialization with PyArrow. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column schema inclusion. - - False: Exclude meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - include_source: Whether to include source info fields. - - - Returns: - PyArrow Schema describing the datagram structure. - - Example: - >>> schema = datagram.arrow_schema() - >>> schema.names - ['user_id', 'name'] - """ - ... - - def as_dict( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> dict[str, DataValue]: - """ - Convert datagram to dictionary format. - - Provides a simple key-value representation useful for debugging, - serialization, and interop with dict-based APIs. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context key. - include_source: Whether to include source info fields. - - - Returns: - Dictionary with requested columns as key-value pairs. - - Example: - >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} - >>> full_data = datagram.as_dict( - ... include_meta_columns=True, - ... include_context=True - ... ) - """ - ... - - def as_table( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> "pa.Table": - """ - Convert datagram to PyArrow Table format. - - Provides a standardized columnar representation suitable for analysis, - processing, and interoperability with Arrow-based tools. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context column. - include_source: Whether to include source info columns in the schema. - - Returns: - PyArrow Table with requested columns. - - Example: - >>> table = datagram.as_table() # Data columns only - >>> full_table = datagram.as_table( - ... include_meta_columns=True, - ... include_context=True - ... 
) - >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" - """ - ... - - # TODO: add this back - # def as_arrow_compatible_dict( - # self, - # include_all_info: bool = False, - # include_meta_columns: bool | Collection[str] = False, - # include_context: bool = False, - # include_source: bool = False, - # ) -> dict[str, Any]: - # """Extended version with source info support.""" - # ... - - def as_datagram( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_system_tags: bool = False, - ) -> Datagram: - """ - Convert the packet to a Datagram. - - Args: - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - - Returns: - Datagram: Datagram representation of packet data - """ - ... - def system_tags(self) -> dict[str, DataValue]: """ Return metadata about the packet's source/origin. @@ -855,214 +789,6 @@ class Packet(Datagram, Protocol): data flow: Tags provide context, Packets provide content. """ - def keys( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> tuple[str, ...]: - """ - Return tuple of column names. - - Provides access to column names with filtering options for different - column types. Default returns only data column names. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Return only data column names (default) - - True: Include all meta column names - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - include_source: Whether to include source info fields. - - - Returns: - Tuple of column names based on inclusion criteria. - - Example: - >>> datagram.keys() # Data columns only - ('user_id', 'name', 'email') - >>> datagram.keys(include_meta_columns=True) - ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_meta_columns=["pipeline"]) - ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_context=True) - ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') - """ - ... - - def types( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> PythonSchema: - """ - Return type specification mapping field names to Python types. - - The TypeSpec enables type checking and validation throughout the system. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column type inclusion. - - False: Exclude meta column types (default) - - True: Include all meta column types - - Collection[str]: Include meta column types matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context type. 
- include_source: Whether to include source info fields. - - Returns: - TypeSpec mapping field names to their Python types. - - Example: - >>> datagram.types() - {'user_id': , 'name': } - """ - ... - - def arrow_schema( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> "pa.Schema": - """ - Return PyArrow schema representation. - - The schema provides structured field and type information for efficient - serialization and deserialization with PyArrow. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column schema inclusion. - - False: Exclude meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - include_source: Whether to include source info fields. - - - Returns: - PyArrow Schema describing the datagram structure. - - Example: - >>> schema = datagram.arrow_schema() - >>> schema.names - ['user_id', 'name'] - """ - ... - - def as_dict( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> dict[str, DataValue]: - """ - Convert datagram to dictionary format. - - Provides a simple key-value representation useful for debugging, - serialization, and interop with dict-based APIs. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context key. - include_source: Whether to include source info fields. - - - Returns: - Dictionary with requested columns as key-value pairs. - - Example: - >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} - >>> full_data = datagram.as_dict( - ... include_meta_columns=True, - ... include_context=True - ... ) - """ - ... - - def as_table( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> "pa.Table": - """ - Convert datagram to PyArrow Table format. - - Provides a standardized columnar representation suitable for analysis, - processing, and interoperability with Arrow-based tools. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context column. - include_source: Whether to include source info columns in the schema. - - Returns: - PyArrow Table with requested columns. - - Example: - >>> table = datagram.as_table() # Data columns only - >>> full_table = datagram.as_table( - ... include_meta_columns=True, - ... 
include_context=True - ... ) - >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" - """ - ... - - # TODO: add this back - # def as_arrow_compatible_dict( - # self, - # include_all_info: bool = False, - # include_meta_columns: bool | Collection[str] = False, - # include_context: bool = False, - # include_source: bool = False, - # ) -> dict[str, Any]: - # """Extended version with source info support.""" - # ... - - def as_datagram( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_source: bool = False, - ) -> Datagram: - """ - Convert the packet to a Datagram. - - Args: - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - - Returns: - Datagram: Datagram representation of packet data - """ - ... - def source_info(self) -> dict[str, str | None]: """ Return metadata about the packet's source/origin. diff --git a/src/orcapod/protocols/core_protocols/function_pod.py b/src/orcapod/protocols/core_protocols/function_pod.py new file mode 100644 index 0000000..2b6108b --- /dev/null +++ b/src/orcapod/protocols/core_protocols/function_pod.py @@ -0,0 +1,33 @@ +from typing import Protocol, runtime_checkable + +from orcapod.protocols.core_protocols.datagrams import Packet +from orcapod.protocols.core_protocols.packet_function import PacketFunction +from orcapod.protocols.core_protocols.pod import Pod + + +@runtime_checkable +class FunctionPod(Pod, Protocol): + """ + Pod based on PacketFunction. + + + """ + + @property + def packet_function(self) -> PacketFunction: + """ + The PacketFunction that defines the computation for this FunctionPod. + """ + ... + + def process_packet(self, packet: Packet) -> Packet | None: + """ + Process a single packet using the pod's PacketFunction. + + Args: + packet (Packet): The input packet to process. + + Returns: + Packet | None: The processed packet, or None if filtered out. + """ + ... diff --git a/src/orcapod/protocols/core_protocols/kernel.py b/src/orcapod/protocols/core_protocols/kernel.py deleted file mode 100644 index 842d7af..0000000 --- a/src/orcapod/protocols/core_protocols/kernel.py +++ /dev/null @@ -1,201 +0,0 @@ -from collections.abc import Collection -from datetime import datetime -from typing import Any, Protocol, runtime_checkable -from orcapod.protocols.hashing_protocols import ContentIdentifiable -from orcapod.types import PythonSchema -from orcapod.protocols.core_protocols.base import Labelable -from orcapod.protocols.core_protocols.streams import Stream, LiveStream - - -@runtime_checkable -class Kernel(ContentIdentifiable, Labelable, Protocol): - """ - The fundamental unit of computation in Orcapod. - - Kernels are the building blocks of computational graphs, transforming - zero, one, or more input streams into a single output stream. They - encapsulate computation logic while providing consistent interfaces - for validation, type checking, and execution. 
- - Key design principles: - - Immutable: Kernels don't change after creation - - Deterministic: Same inputs always produce same outputs - - Composable: Kernels can be chained and combined - - Trackable: All invocations are recorded for lineage - - Type-safe: Strong typing and validation throughout - - Execution modes: - - __call__(): Full-featured execution with tracking, returns LiveStream - - forward(): Pure computation without side effects, returns Stream - - The distinction between these modes enables both production use (with - full tracking) and testing/debugging (without side effects). - """ - - @property - def reference(self) -> tuple[str, ...]: - """ - Reference to the kernel - - The reference is used for caching/storage and tracking purposes. - As the name indicates, this is how data originating from the kernel will be referred to. - - - Returns: - tuple[str, ...]: Reference for this kernel - """ - ... - - @property - def data_context_key(self) -> str: - """ - Return the context key for this kernel's data processing. - - The context key is used to interpret how data columns should be - processed and converted. It provides semantic meaning to the data - being processed by this kernel. - - Returns: - str: Context key for this kernel's data processing - """ - ... - - @property - def last_modified(self) -> datetime | None: - """ - When the kernel was last modified. For most kernels, this is the timestamp - of the kernel creation. - """ - ... - - def __call__( - self, *streams: Stream, label: str | None = None, **kwargs - ) -> LiveStream: - """ - Main interface for kernel invocation with full tracking and guarantees. - - This is the primary way to invoke kernels in production. It provides - a complete execution pipeline: - 1. Validates input streams against kernel requirements - 2. Registers the invocation with the computational graph - 3. Calls forward() to perform the actual computation - 4. Ensures the result is a LiveStream that stays current - - The returned LiveStream automatically stays up-to-date with its - upstream dependencies, making it suitable for real-time processing - and reactive applications. - - Args: - *streams: Input streams to process (can be empty for source kernels) - label: Optional label for this invocation (overrides kernel.label) - **kwargs: Additional arguments for kernel configuration - - Returns: - LiveStream: Live stream that stays up-to-date with upstreams - - Raises: - ValidationError: If input streams are invalid for this kernel - TypeMismatchError: If stream types are incompatible - ValueError: If required arguments are missing - """ - ... - - def forward(self, *streams: Stream) -> Stream: - """ - Perform the actual computation without side effects. - - This method contains the core computation logic and should be - overridden by subclasses. It performs pure computation without: - - Registering with the computational graph - - Performing validation (caller's responsibility) - - Guaranteeing result type (may return static or live streams) - - The returned stream must be accurate at the time of invocation but - need not stay up-to-date with upstream changes. This makes forward() - suitable for: - - Testing and debugging - - Batch processing where currency isn't required - - Internal implementation details - - Args: - *streams: Input streams to process - - Returns: - Stream: Result of the computation (may be static or live) - """ - ... 
- - def output_types( - self, *streams: Stream, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - """ - Determine output types without triggering computation. - - This method performs type inference based on input stream types, - enabling efficient type checking and stream property queries. - It should be fast and not trigger any expensive computation. - - Used for: - - Pre-execution type validation - - Query planning and optimization - - Schema inference in complex pipelines - - IDE support and developer tooling - - Args: - *streams: Input streams to analyze - - Returns: - tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) for output - - Raises: - ValidationError: If input types are incompatible - TypeError: If stream types cannot be processed - """ - ... - - def validate_inputs(self, *streams: Stream) -> None: - """ - Validate input streams, raising exceptions if incompatible. - - This method is called automatically by __call__ before computation - to provide fail-fast behavior. It should check: - - Number of input streams - - Stream types and schemas - - Any kernel-specific requirements - - Business logic constraints - - The goal is to catch errors early, before expensive computation - begins, and provide clear error messages for debugging. - - Args: - *streams: Input streams to validate - - Raises: - ValidationError: If streams are invalid for this kernel - TypeError: If stream types are incompatible - ValueError: If stream content violates business rules - """ - ... - - def identity_structure(self, streams: Collection[Stream] | None = None) -> Any: - """ - Generate a unique identity structure for this kernel and/or kernel invocation. - When invoked without streams, it should return a structure - that uniquely identifies the kernel itself (e.g., class name, parameters). - When invoked with streams, it should include the identity of the streams - to distinguish different invocations of the same kernel. - - This structure is used for: - - Caching and memoization - - Debugging and error reporting - - Tracking kernel invocations in computational graphs - - Args: - streams: Optional input streams for this invocation. If None, identity_structure is - based solely on the kernel. If streams are provided, they are included in the identity - to differentiate between different invocations of the same kernel. - - Returns: - Any: Unique identity structure (e.g., tuple of class name and stream identities) - """ - ... diff --git a/src/orcapod/protocols/core_protocols/labelable.py b/src/orcapod/protocols/core_protocols/labelable.py new file mode 100644 index 0000000..51c47f7 --- /dev/null +++ b/src/orcapod/protocols/core_protocols/labelable.py @@ -0,0 +1,47 @@ +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class Labelable(Protocol): + """ + Protocol for objects that can have a human-readable label. + + Labels provide meaningful names for objects in the computational graph, + making debugging, visualization, and monitoring much easier. They serve + as human-friendly identifiers that complement the technical identifiers + used internally. + + Labels are optional but highly recommended for: + - Debugging complex computational graphs + - Visualization and monitoring tools + - Error messages and logging + - User interfaces and dashboards + + """ + + @property + def label(self) -> str: + """ + Return the human-readable label for this object. 
+
+        Labels should be descriptive and help users understand the purpose
+        or role of the object in the computational graph.
+
+        Returns:
+            str: Human-readable label for this object. Implementations may
+                fall back to a default name when no label has been set.
+        """
+        ...
+
+    @label.setter
+    def label(self, label: str | None) -> None:
+        """
+        Set the human-readable label for this object.
+
+        Labels should be descriptive and help users understand the purpose
+        or role of the object in the computational graph.
+
+        Args:
+            label (str | None): Human-readable label for this object, or None to fall back to default naming.
+        """
+        ...
diff --git a/src/orcapod/protocols/core_protocols/operator_pod.py b/src/orcapod/protocols/core_protocols/operator_pod.py
new file mode 100644
index 0000000..f24b729
--- /dev/null
+++ b/src/orcapod/protocols/core_protocols/operator_pod.py
@@ -0,0 +1,12 @@
+from typing import Protocol, runtime_checkable
+
+from orcapod.protocols.core_protocols.pod import Pod
+
+
+@runtime_checkable
+class OperatorPod(Pod, Protocol):
+    """
+    Pod that performs operations on streams.
+
+    Base protocol for pods that transform one or more input streams (e.g., joins, filters, mappers).
+    """
diff --git a/src/orcapod/protocols/core_protocols/packet_function.py b/src/orcapod/protocols/core_protocols/packet_function.py
new file mode 100644
index 0000000..c501f01
--- /dev/null
+++ b/src/orcapod/protocols/core_protocols/packet_function.py
@@ -0,0 +1,140 @@
+from typing import Any, Protocol, runtime_checkable
+
+from orcapod.protocols.core_protocols.datagrams import Packet
+from orcapod.types import PythonSchema
+
+
+@runtime_checkable
+class PacketFunction(Protocol):
+    """
+    Protocol for a packet-processing function.
+
+    Processes individual packets with declared input/output schemas.
+    """
+
+    # ==================== Identity & Metadata ====================
+    @property
+    def packet_function_type_id(self) -> str:
+        """How functions are defined and executed (e.g., python.function.v2)"""
+        ...
+
+    @property
+    def canonical_function_name(self) -> str:
+        """Human-readable function identifier"""
+        ...
+
+    @property
+    def major_version(self) -> int:
+        """Breaking changes increment this"""
+        ...
+
+    @property
+    def minor_version_string(self) -> str:
+        """Flexible minor version (e.g., "1", "4.3rc", "apple")"""
+        ...
+
+    @property
+    def input_packet_schema(self) -> PythonSchema:
+        """
+        Schema for input packets that this packet function can process.
+
+        Defines the exact schema that input packets must conform to.
+
+        This specification is used for:
+        - Runtime type validation
+        - Compile-time type checking
+        - Schema inference and documentation
+        - Input validation and error reporting
+
+        Returns:
+            PythonSchema: Input packet schema as a dictionary mapping
+        """
+        ...
+
+    @property
+    def output_packet_schema(self) -> PythonSchema:
+        """
+        Schema for output packets that this packet function produces.
+
+        This is typically determined by the packet function's computational logic
+        and is used for:
+        - Type checking downstream pods
+        - Schema inference in complex pipelines
+        - Query planning and optimization
+        - Documentation and developer tooling
+
+        Returns:
+            PythonSchema: Output packet schema as a dictionary mapping
+        """
+        ...
+
+    # ==================== Content-Addressable Identity ====================
+    def get_function_variation_data(self) -> dict[str, Any]:
+        """Raw data defining function variation - system computes hash"""
+        ...
+
+    def get_execution_data(self) -> dict[str, Any]:
+        """Raw data defining execution context - system computes hash"""
+        ...
+
+    async def async_call(
+        self,
+        packet: Packet,
+    ) -> Packet | None:
+        """
+        Asynchronously process a single packet.
+
+        This is the core method that defines the packet function's computational behavior.
+        It processes one packet at a time, enabling:
+        - Fine-grained caching at the packet level
+        - Parallelization opportunities
+        - Just-in-time evaluation
+        - Filtering operations (by returning None)
+
+        The method signature supports:
+        - Packet transformation (modify content)
+        - Filtering (return None to exclude packet)
+        - Pass-through (return inputs unchanged)
+
+        Args:
+            packet: The data payload to process
+
+        Returns:
+            Packet | None: Processed packet, or None to filter it out
+
+        Raises:
+            TypeError: If packet doesn't match input_packet_schema
+            ValueError: If packet data is invalid for processing
+        """
+        ...
+
+    def call(
+        self,
+        packet: Packet,
+    ) -> Packet | None:
+        """
+        Process a single packet.
+
+        This is the core method that defines the packet function's computational behavior.
+        It processes one packet at a time, enabling:
+        - Fine-grained caching at the packet level
+        - Parallelization opportunities
+        - Just-in-time evaluation
+        - Filtering operations (by returning None)
+
+        The method signature supports:
+        - Packet transformation (modify content)
+        - Filtering (return None to exclude packet)
+        - Pass-through (return inputs unchanged)
+
+        Args:
+            packet: The data payload to process
+
+        Returns:
+            Packet | None: Processed packet, or None to filter it out
+
+        Raises:
+            TypeError: If packet doesn't match input_packet_schema
+            ValueError: If packet data is invalid for processing
+        """
+        ...
diff --git a/src/orcapod/protocols/core_protocols/pod.py b/src/orcapod/protocols/core_protocols/pod.py
new file mode 100644
index 0000000..39d947b
--- /dev/null
+++ b/src/orcapod/protocols/core_protocols/pod.py
@@ -0,0 +1,147 @@
+from collections.abc import Collection
+from typing import Any, Protocol, TypeAlias, runtime_checkable
+
+from orcapod.protocols.core_protocols.datagrams import ColumnConfig
+from orcapod.protocols.core_protocols.labelable import Labelable
+from orcapod.protocols.core_protocols.streams import Stream
+from orcapod.protocols.core_protocols.temporal import Temporal
+from orcapod.protocols.hashing_protocols import ContentIdentifiable, DataContextAware
+from orcapod.types import PythonSchema
+
+# Core recursive types
+ArgumentGroup: TypeAlias = "SymmetricGroup | OrderedGroup | Stream"
+
+SymmetricGroup: TypeAlias = frozenset[ArgumentGroup]  # Order-independent
+OrderedGroup: TypeAlias = tuple[ArgumentGroup, ...]  # Order-dependent
+
+
+@runtime_checkable
+class Pod(DataContextAware, ContentIdentifiable, Labelable, Temporal, Protocol):
+    """
+    The fundamental unit of computation in Orcapod.
+
+    Pods are the building blocks of computational graphs, transforming
+    zero, one, or more input streams into a single output stream. They
+    encapsulate computation logic while providing consistent interfaces
+    for validation, type checking, and execution.
+
+    Key design principles:
+    - Immutable: Pods don't change after creation
+    - Composable: Pods can be chained and combined
+    - Type-safe: Strong typing and validation throughout
+
+    Execution model:
+    - process(): pure computation over the input streams, returning a Stream.
+      Validation and invocation tracking are the caller's responsibility,
+      which keeps process() equally suitable for production pipelines and
+      for testing/debugging without side effects.
+ """ + + @property + def uri(self) -> tuple[str, ...]: + """ + Unique identifier for the pod + + The URI is used for caching/storage and tracking purposes. + As the name indicates, this is how data originating from the kernel will be referred to. + + + Returns: + tuple[str, ...]: URI for this pod + """ + ... + + def validate_inputs(self, *streams: Stream) -> None: + """ + Validate input streams, raising exceptions if invalid. + + Should check: + - Number of input streams + - Stream types and schemas + - Kernel-specific requirements + - Business logic constraints + + Args: + *streams: Input streams to validate + + Raises: + PodInputValidationError: If inputs are invalid + """ + ... + + def argument_symmetry(self, streams: Collection[Stream]) -> ArgumentGroup: + """ + Describe symmetry/ordering constraints on input arguments. + + Returns a structure encoding which arguments can be reordered: + - SymmetricGroup (frozenset): Arguments commute (order doesn't matter) + - OrderedGroup (tuple): Arguments have fixed positions + - Nesting expresses partial symmetry + + Examples: + Full symmetry (Join): + return frozenset([a, b, c]) + + No symmetry (Concatenate): + return (a, b, c) + + Partial symmetry: + return (frozenset([a, b]), c) + # a,b are interchangeable, c has fixed position + """ + ... + + def output_schema( + self, + *streams: Stream, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> tuple[PythonSchema, PythonSchema]: + """ + Determine output schemas without triggering computation. + + This method performs type inference based on input stream types, + enabling efficient type checking and stream property queries. + It should be fast and not trigger any expensive computation. + + Used for: + - Pre-execution type validation + - Query planning and optimization + - Schema inference in complex pipelines + - IDE support and developer tooling + + Args: + *streams: Input streams to analyze + + Returns: + tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) for output + + Raises: + ValidationError: If input types are incompatible + TypeError: If stream types cannot be processed + """ + ... + + def process(self, *streams: Stream) -> Stream: + """ + Executes the computation on zero or more input streams. + This method contains the core computation logic and should be + overridden by subclasses. It performs pure computation without: + - Performing validation (caller's responsibility) + - Guaranteeing result type (may return static or live streams) + + The returned stream must be accurate at the time of invocation but + need not stay up-to-date with upstream changes. This makes forward() + suitable for: + - Testing and debugging + - Batch processing where currency isn't required + - Internal implementation details + + Args: + *streams: Input streams to process + + Returns: + Stream: Result of the computation (may be static or live) + """ + ... 
diff --git a/src/orcapod/protocols/core_protocols/pods.py b/src/orcapod/protocols/core_protocols/pods.py deleted file mode 100644 index b3c9513..0000000 --- a/src/orcapod/protocols/core_protocols/pods.py +++ /dev/null @@ -1,228 +0,0 @@ -from typing import TYPE_CHECKING, Protocol, runtime_checkable - -from orcapod.protocols.core_protocols.base import ExecutionEngine -from orcapod.protocols.core_protocols.datagrams import Packet, Tag -from orcapod.protocols.core_protocols.kernel import Kernel -from orcapod.types import PythonSchema - -if TYPE_CHECKING: - import pyarrow as pa - - -@runtime_checkable -class Pod(Kernel, Protocol): - """ - Specialized kernel for packet-level processing with advanced caching. - - Pods represent a different computational model from regular kernels: - - Process data one packet at a time (enabling fine-grained parallelism) - - Support just-in-time evaluation (computation deferred until needed) - - Provide stricter type contracts (clear input/output schemas) - - Enable advanced caching strategies (packet-level caching) - - The Pod abstraction is ideal for: - - Expensive computations that benefit from caching - - Operations that can be parallelized at the packet level - - Transformations with strict type contracts - - Processing that needs to be deferred until access time - - Functions that operate on individual data items - - Pods use a different execution model where computation is deferred - until results are actually needed, enabling efficient resource usage - and fine-grained caching. - """ - - @property - def version(self) -> str: ... - - def get_record_id(self, packet: Packet, execution_engine_hash: str) -> str: ... - - @property - def tiered_pod_id(self) -> dict[str, str]: - """ - Return a dictionary representation of the tiered pod's unique identifier. - The key is supposed to be ordered from least to most specific, allowing - for hierarchical identification of the pod. - - This is primarily used for tiered memoization/caching strategies. - - Returns: - dict[str, str]: Dictionary representation of the pod's ID - """ - ... - - def input_packet_types(self) -> PythonSchema: - """ - TypeSpec for input packets that this Pod can process. - - Defines the exact schema that input packets must conform to. - Pods are typically much stricter about input types than regular - kernels, requiring precise type matching for their packet-level - processing functions. - - This specification is used for: - - Runtime type validation - - Compile-time type checking - - Schema inference and documentation - - Input validation and error reporting - - Returns: - TypeSpec: Dictionary mapping field names to required packet types - """ - ... - - def output_packet_types(self) -> PythonSchema: - """ - TypeSpec for output packets that this Pod produces. - - Defines the schema of packets that will be produced by this Pod. - This is typically determined by the Pod's computational function - and is used for: - - Type checking downstream kernels - - Schema inference in complex pipelines - - Query planning and optimization - - Documentation and developer tooling - - Returns: - TypeSpec: Dictionary mapping field names to output packet types - """ - ... - - async def async_call( - self, - tag: Tag, - packet: Packet, - record_id: str | None = None, - execution_engine: ExecutionEngine | None = None, - ) -> tuple[Tag, Packet | None]: ... 
- - def call( - self, - tag: Tag, - packet: Packet, - record_id: str | None = None, - execution_engine: ExecutionEngine | None = None, - ) -> tuple[Tag, Packet | None]: - """ - Process a single packet with its associated tag. - - This is the core method that defines the Pod's computational behavior. - It processes one (tag, packet) pair at a time, enabling: - - Fine-grained caching at the packet level - - Parallelization opportunities - - Just-in-time evaluation - - Filtering operations (by returning None) - - The method signature supports: - - Tag transformation (modify metadata) - - Packet transformation (modify content) - - Filtering (return None to exclude packet) - - Pass-through (return inputs unchanged) - - Args: - tag: Metadata associated with the packet - packet: The data payload to process - - Returns: - tuple[Tag, Packet | None]: - - Tag: Output tag (may be modified from input) - - Packet: Processed packet, or None to filter it out - - Raises: - TypeError: If packet doesn't match input_packet_types - ValueError: If packet data is invalid for processing - """ - ... - - -@runtime_checkable -class CachedPod(Pod, Protocol): - async def async_call( - self, - tag: Tag, - packet: Packet, - record_id: str | None = None, - execution_engine: ExecutionEngine | None = None, - skip_cache_lookup: bool = False, - skip_cache_insert: bool = False, - ) -> tuple[Tag, Packet | None]: ... - - def set_mode(self, mode: str) -> None: ... - - @property - def mode(self) -> str: ... - - # @mode.setter - # def mode(self, value: str) -> None: ... - - def call( - self, - tag: Tag, - packet: Packet, - record_id: str | None = None, - execution_engine: ExecutionEngine | None = None, - skip_cache_lookup: bool = False, - skip_cache_insert: bool = False, - ) -> tuple[Tag, Packet | None]: - """ - Process a single packet with its associated tag. - - This is the core method that defines the Pod's computational behavior. - It processes one (tag, packet) pair at a time, enabling: - - Fine-grained caching at the packet level - - Parallelization opportunities - - Just-in-time evaluation - - Filtering operations (by returning None) - - The method signature supports: - - Tag transformation (modify metadata) - - Packet transformation (modify content) - - Filtering (return None to exclude packet) - - Pass-through (return inputs unchanged) - - Args: - tag: Metadata associated with the packet - packet: The data payload to process - - Returns: - tuple[Tag, Packet | None]: - - Tag: Output tag (may be modified from input) - - Packet: Processed packet, or None to filter it out - - Raises: - TypeError: If packet doesn't match input_packet_types - ValueError: If packet data is invalid for processing - """ - ... - - def get_cached_output_for_packet(self, input_packet: Packet) -> Packet | None: - """ - Retrieve the cached output packet for a given input packet. - - Args: - input_packet: The input packet to look up in the cache - - Returns: - Packet | None: The cached output packet, or None if not found - """ - ... - - def get_all_cached_outputs( - self, include_system_columns: bool = False - ) -> "pa.Table | None": - """ - Retrieve all packets processed by this Pod. - - This method returns a table containing all packets processed by the Pod, - including metadata and system columns if requested. 
It is useful for: - - Debugging and analysis - - Auditing and data lineage tracking - - Performance monitoring - - Args: - include_system_columns: Whether to include system columns in the output - - Returns: - pa.Table | None: A table containing all processed records, or None if no records are available - """ - ... diff --git a/src/orcapod/protocols/core_protocols/source.py b/src/orcapod/protocols/core_protocols/source_pod.py similarity index 91% rename from src/orcapod/protocols/core_protocols/source.py rename to src/orcapod/protocols/core_protocols/source_pod.py index e94f336..8545c7c 100644 --- a/src/orcapod/protocols/core_protocols/source.py +++ b/src/orcapod/protocols/core_protocols/source_pod.py @@ -1,11 +1,11 @@ from typing import Protocol, runtime_checkable -from orcapod.protocols.core_protocols.kernel import Kernel +from orcapod.protocols.core_protocols.pod import Pod from orcapod.protocols.core_protocols.streams import Stream @runtime_checkable -class Source(Kernel, Stream, Protocol): +class SourcePod(Pod, Stream, Protocol): """ Entry point for data into the computational graph. diff --git a/src/orcapod/protocols/core_protocols/streams.py b/src/orcapod/protocols/core_protocols/streams.py index 36cd369..85b490c 100644 --- a/src/orcapod/protocols/core_protocols/streams.py +++ b/src/orcapod/protocols/core_protocols/streams.py @@ -1,21 +1,22 @@ from collections.abc import Collection, Iterator, Mapping -from datetime import datetime from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable -from orcapod.protocols.core_protocols.base import ExecutionEngine, Labelable -from orcapod.protocols.core_protocols.datagrams import Packet, Tag +from orcapod.protocols.core_protocols.datagrams import ColumnConfig, Packet, Tag +from orcapod.protocols.core_protocols.labelable import Labelable +from orcapod.protocols.core_protocols.temporal import Temporal from orcapod.protocols.hashing_protocols import ContentIdentifiable from orcapod.types import PythonSchema if TYPE_CHECKING: + import pandas as pd import polars as pl import pyarrow as pa - import pandas as pd - from orcapod.protocols.core_protocols.kernel import Kernel + + from orcapod.protocols.core_protocols.pod import Pod @runtime_checkable -class Stream(ContentIdentifiable, Labelable, Protocol): +class Stream(ContentIdentifiable, Labelable, Temporal, Protocol): """ Base protocol for all streams in Orcapod. @@ -35,70 +36,20 @@ class Stream(ContentIdentifiable, Labelable, Protocol): - Conversion to common formats (tables, dictionaries) """ - @property - def substream_identities(self) -> tuple[str, ...]: - """ - Unique identifiers for sub-streams within this stream. - - This property provides a way to identify and differentiate - sub-streams that may be part of a larger stream. It is useful - for tracking and managing complex data flows. - - Returns: - tuple[str, ...]: Unique identifiers for each sub-stream - """ - ... - - @property - def execution_engine(self) -> ExecutionEngine | None: - """ - The execution engine attached to this stream. By default, the stream - will use this execution engine whenever it needs to perform computation. - None means the stream is not attached to any execution engine and will default - to running natively. - """ - - @execution_engine.setter - def execution_engine(self, engine: ExecutionEngine | None) -> None: - """ - Set the execution engine for this stream. - - This allows the stream to use a specific execution engine for - computation, enabling optimized execution strategies and resource - management. 
- - Args: - engine: The execution engine to attach to this stream - """ - ... - - def get_substream(self, substream_id: str) -> "Stream": - """ - Retrieve a specific sub-stream by its identifier. - - This method allows access to individual sub-streams within the - main stream, enabling focused operations on specific data segments. - - Args: - substream_id: Unique identifier for the desired sub-stream. - - Returns: - Stream: The requested sub-stream if it exists - """ - ... + # TODO: add substream system @property - def source(self) -> "Kernel | None": + def source(self) -> "Pod | None": """ - The kernel that produced this stream. + The pod that produced this stream, if any. This provides lineage information for tracking data flow through the computational graph. Root streams (like file sources) may - have no source kernel. + have no source pod. Returns: - Kernel: The source kernel that created this stream - None: This is a root stream with no source kernel + Pod: The source pod that created this stream + None: This is a root stream with no source pod """ ... @@ -108,8 +59,9 @@ def upstreams(self) -> tuple["Stream", ...]: Input streams used to produce this stream. These are the streams that were provided as input to the source - kernel when this stream was created. Used for dependency tracking - and cache invalidation. + pod when this stream was created. Used for dependency tracking + and cache invalidation. Note that `source` must be checked for + upstreams to be meaningfully inspected. Returns: tuple[Stream, ...]: Upstream dependency streams (empty for sources) @@ -117,7 +69,10 @@ def upstreams(self) -> tuple["Stream", ...]: ... def keys( - self, include_system_tags: bool = False + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Available keys/fields in the stream content. @@ -134,22 +89,11 @@ def keys( """ ... - def tag_keys(self, include_system_tags: bool = False) -> tuple[str, ...]: - """ - Return the keys used for the tag in the pipeline run records. - This is used to store the run-associated tag info. - """ - ... - - def packet_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the packet in the pipeline run records. - This is used to store the run-associated packet info. - """ - ... - - def types( - self, include_system_tags: bool = False + def output_schema( + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> tuple[PythonSchema, PythonSchema]: """ Type specifications for the stream content. @@ -161,140 +105,51 @@ def types( - Compatibility checking between kernels Returns: - tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) - """ - ... - - def tag_types(self, include_system_tags: bool = False) -> PythonSchema: - """ - Type specifications for the stream content. - - Returns the type schema for both tags and packets in this stream. - This information is used for: - - Type checking and validation - - Schema inference and planning - - Compatibility checking between kernels - - Returns: - tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) - """ - ... - - def packet_types(self) -> PythonSchema: ... - - @property - def last_modified(self) -> datetime | None: - """ - When the stream's content was last modified. 
- - This property is crucial for caching decisions and dependency tracking: - - datetime: Content was last modified at this time (cacheable) - - None: Content is never stable, always recompute (some dynamic streams) - - Both static and live streams typically return datetime values, but - live streams update this timestamp whenever their content changes. - - Returns: - datetime: Timestamp of last modification for most streams - None: Stream content is never stable (some special dynamic streams) - """ - ... - - @property - def is_current(self) -> bool: - """ - Whether the stream is up-to-date with its dependencies. - - A stream is current if its content reflects the latest state of its - source kernel and upstream streams. This is used for cache validation - and determining when refresh is needed. - - For live streams, this should always return True since they stay - current automatically. For static streams, this indicates whether - the cached content is still valid. - - Returns: - bool: True if stream is up-to-date, False if refresh needed + tuple[PythonSchema, PythonSchema]: (tag_types, packet_types) """ ... - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: """ - Iterate over (tag, packet) pairs in the stream. + Generates explicit iterator over (tag, packet) pairs in the stream. - This is the primary way to access stream data. The behavior depends - on the stream type: - - Static streams: Return cached/precomputed data - - Live streams: May trigger computation and always reflect current state + Note that multiple invocation of `iter_packets` may not always + return an identical iterator. Yields: tuple[Tag, Packet]: Sequential (tag, packet) pairs """ ... - def iter_packets( - self, execution_engine: ExecutionEngine | None = None - ) -> Iterator[tuple[Tag, Packet]]: - """ - Alias for __iter__ for explicit packet iteration. - - Provides a more explicit method name when the intent is to iterate - over packets specifically, improving code readability. - - This method must return an immutable iterator -- that is, the returned iterator - should not change and must consistently return identical tag,packet pairs across - multiple iterations of the iterator. - - Note that this is NOT to mean that multiple invocation of `iter_packets` must always - return an identical iterator. The iterator returned by `iter_packets` may change - between invocations, but the iterator itself must not change. Consequently, it should be understood - that the returned iterators may be a burden on memory if the stream is large or infinite. - - Yields: - tuple[Tag, Packet]: Sequential (tag, packet) pairs - """ - ... - - def run( - self, *args: Any, execution_engine: ExecutionEngine | None = None, **kwargs: Any - ) -> None: + def as_table( + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> "pa.Table": """ - Execute the stream using the provided execution engine. + Convert the entire stream to a PyArrow Table. - This method triggers computation of the stream content based on its - source kernel and upstream streams. It returns a new stream instance - containing the computed (tag, packet) pairs. + Materializes all (tag, packet) pairs into a single table for + analysis and processing. This operation may be expensive for + large streams or live streams that need computation. 
- Args: - execution_engine: The execution engine to use for computation + If include_content_hash is True, an additional column called "_content_hash" + containing the content hash of each packet is included. If include_content_hash + is a string, it is used as the name of the content hash column. + Returns: + pa.Table: Complete stream data as a PyArrow Table """ ... - async def run_async( - self, *args: Any, execution_engine: ExecutionEngine | None = None, **kwargs: Any - ) -> None: - """ - Asynchronously execute the stream using the provided execution engine. - - This method triggers computation of the stream content based on its - source kernel and upstream streams. It returns a new stream instance - containing the computed (tag, packet) pairs. - - Args: - execution_engine: The execution engine to use for computation - - """ - ... +class StreamWithOperations(Stream, Protocol): def as_df( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: ExecutionEngine | None = None, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pl.DataFrame": """ Convert the entire stream to a Polars DataFrame. @@ -303,12 +158,9 @@ def as_df( def as_lazy_frame( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: ExecutionEngine | None = None, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, ) -> "pl.LazyFrame": """ Load the entire stream to a Polars LazyFrame. @@ -317,53 +169,28 @@ def as_lazy_frame( def as_polars_df( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: ExecutionEngine | None = None, - ) -> "pl.DataFrame": ... + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> "pl.DataFrame": + """ + Convert the entire stream to a Polars DataFrame. + """ + ... def as_pandas_df( self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - index_by_tags: bool = True, - execution_engine: ExecutionEngine | None = None, - ) -> "pd.DataFrame": ... - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: ExecutionEngine | None = None, - ) -> "pa.Table": + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> "pd.DataFrame": """ - Convert the entire stream to a PyArrow Table. - - Materializes all (tag, packet) pairs into a single table for - analysis and processing. This operation may be expensive for - large streams or live streams that need computation. - - If include_content_hash is True, an additional column called "_content_hash" - containing the content hash of each packet is included. If include_content_hash - is a string, it is used as the name of the content hash column. - - Returns: - pa.Table: Complete stream data as a PyArrow Table + Convert the entire stream to a Pandas DataFrame. """ ... 
def flow( self, - execution_engine: ExecutionEngine | None = None, ) -> Collection[tuple[Tag, Packet]]: """ Return the entire stream as a collection of (tag, packet) pairs. @@ -512,76 +339,3 @@ def batch( Self: New stream containing batched (tag, packet) pairs. """ ... - - -@runtime_checkable -class LiveStream(Stream, Protocol): - """ - A stream that automatically stays up-to-date with its upstream dependencies. - - LiveStream extends the base Stream protocol with capabilities for "up-to-date" - data flow and reactive computation. Unlike static streams which represent - snapshots, LiveStreams provide the guarantee that their content always - reflects the current state of their dependencies. - - Key characteristics: - - Automatically refresh the stream if changes in the upstreams are detected - - Track last_modified timestamp when content changes - - Support manual refresh triggering and invalidation - - By design, LiveStream would return True for is_current except when auto-update fails. - - LiveStreams are always returned by Kernel.__call__() methods, ensuring - that normal kernel usage produces live, up-to-date results. - - Caching behavior: - - last_modified updates whenever content changes - - Can be cached based on dependency timestamps - - Invalidation happens automatically when upstreams change - - Use cases: - - Real-time data processing pipelines - - Reactive user interfaces - - Monitoring and alerting systems - - Dynamic dashboard updates - - Any scenario requiring current data - """ - - def refresh(self, force: bool = False) -> bool: - """ - Manually trigger a refresh of this stream's content. - - Forces the stream to check its upstream dependencies and update - its content if necessary. This is useful when: - - You want to ensure the latest data before a critical operation - - You need to force computation at a specific time - - You're debugging data flow issues - - You want to pre-compute results for performance - Args: - force: If True, always refresh even if the stream is current. - If False, only refresh if the stream is not current. - - Returns: - bool: True if the stream was refreshed, False if it was already current. - Note: LiveStream refreshes automatically on access, so this - method may be a no-op for some implementations. However, it's - always safe to call if you need to control when the cache is refreshed. - """ - ... - - def invalidate(self) -> None: - """ - Mark this stream as invalid, forcing a refresh on next access. - - This method is typically called when: - - Upstream dependencies have changed - - The source kernel has been modified - - External data sources have been updated - - Manual cache invalidation is needed - - The stream will automatically refresh its content the next time - it's accessed (via iteration, as_table(), etc.). - - This is more efficient than immediate refresh when you know the - data will be accessed later. - """ - ... diff --git a/src/orcapod/protocols/core_protocols/temporal.py b/src/orcapod/protocols/core_protocols/temporal.py new file mode 100644 index 0000000..e714903 --- /dev/null +++ b/src/orcapod/protocols/core_protocols/temporal.py @@ -0,0 +1,24 @@ +from datetime import datetime +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class Temporal(Protocol): + """ + Protocol for objects that track temporal state. + + Objects implementing Temporal can report when their content + was last modified, enabling cache invalidation, incremental + processing, and dependency tracking. 
+ """ + + @property + def last_modified(self) -> datetime | None: + """ + When this object's content was last modified. + + Returns: + datetime: Content last modified timestamp (timezone-aware) + None: Modification time unknown (assume always changed) + """ + ... diff --git a/src/orcapod/protocols/core_protocols/trackers.py b/src/orcapod/protocols/core_protocols/trackers.py index 7bc9a1e..9f3c76a 100644 --- a/src/orcapod/protocols/core_protocols/trackers.py +++ b/src/orcapod/protocols/core_protocols/trackers.py @@ -1,8 +1,9 @@ -from typing import Protocol, runtime_checkable from contextlib import AbstractContextManager -from orcapod.protocols.core_protocols.kernel import Kernel -from orcapod.protocols.core_protocols.pods import Pod -from orcapod.protocols.core_protocols.source import Source +from typing import Protocol, runtime_checkable + +from orcapod.protocols.core_protocols.packet_function import PacketFunction +from orcapod.protocols.core_protocols.pod import Pod +from orcapod.protocols.core_protocols.source_pod import SourcePod from orcapod.protocols.core_protocols.streams import Stream @@ -49,58 +50,63 @@ def is_active(self) -> bool: """ ... - def record_kernel_invocation( - self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None + def record_pod_invocation( + self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None ) -> None: """ - Record a kernel invocation in the computational graph. + Record a pod invocation in the computational graph. - This method is called whenever a kernel is invoked. The tracker + This method is called whenever a pod is invoked. The tracker should record: - - The kernel and its properties + - The pod and its properties - The input streams that were used as input - Timing and performance information - Any relevant metadata Args: - kernel: The kernel that was invoked + pod: The pod that was invoked upstreams: The input streams used for this invocation """ ... - def record_source_invocation( - self, source: Source, label: str | None = None + def record_source_pod_invocation( + self, source_pod: SourcePod, label: str | None = None ) -> None: """ - Record a source invocation in the computational graph. + Record a source pod invocation in the computational graph. - This method is called whenever a source is invoked. The tracker - should record: - - The source and its properties + This method should be called to track a source pod invocation. + The tracker should record: + - The pod and its properties + - The input streams that were used as input - Timing and performance information - Any relevant metadata Args: - source: The source that was invoked + source_pod: The source pod that was invoked + label: An optional label for the invocation """ ... - def record_pod_invocation( - self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None + def record_packet_function_invocation( + self, + packet_function: PacketFunction, + input_stream: Stream, + label: str | None = None, ) -> None: """ - Record a pod invocation in the computational graph. + Record a packet function invocation in the computational graph. - This method is called whenever a pod is invoked. The tracker + This method is called whenever a packet function is invoked. 
The tracker should record: - - The pod and its properties - - The upstream streams that were used as input + - The packet function and its properties + - The input stream that was used as input - Timing and performance information - Any relevant metadata Args: - pod: The pod that was invoked - upstreams: The input streams used for this invocation + packet_function: The packet function that was invoked + input_stream: The input stream used for this invocation """ ... @@ -163,8 +169,8 @@ def deregister_tracker(self, tracker: Tracker) -> None: """ ... - def record_kernel_invocation( - self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None + def record_pod_invocation( + self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None ) -> None: """ Record a stream in all active trackers. @@ -178,8 +184,8 @@ def record_kernel_invocation( """ ... - def record_source_invocation( - self, source: Source, label: str | None = None + def record_source_pod_invocation( + self, source_pod: SourcePod, label: str | None = None ) -> None: """ Record a source invocation in the computational graph. @@ -195,18 +201,21 @@ def record_source_invocation( """ ... - def record_pod_invocation( - self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None + def record_packet_function_invocation( + self, + packet_function: PacketFunction, + input_stream: Stream, + label: str | None = None, ) -> None: """ - Record a stream in all active trackers. + Record a packet function invocation in all active trackers. - This method broadcasts the stream recording to all currently` + This method broadcasts the packet function recording to all currently active and registered trackers. It provides a single point of entry for recording events, simplifying kernel implementations. Args: - stream: The stream to record in all active trackers + packet_function: The packet function to record in all active trackers """ ... diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 10719af..15f37c7 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -74,6 +74,21 @@ def display_name(self, length: int = 8) -> str: return f"{self.method}:{self.to_hex(length)}" +@runtime_checkable +class DataContextAware(Protocol): + """Protocol for objects aware of their data context.""" + + @property + def data_context_key(self) -> str: + """ + Return the data context key associated with this object. + + Returns: + str: The data context key + """ + ... + + @runtime_checkable class ContentIdentifiable(Protocol): """Protocol for objects that can provide an identity structure.""" @@ -91,7 +106,7 @@ def identity_structure(self) -> Any: def content_hash(self) -> ContentHash: """ - Compute a hash based on the content of this object. + Compute a hash based on the identity content of this object. Returns: bytes: A byte representation of the hash based on the content. 
diff --git a/src/orcapod/protocols/legacy_data_protocols.py b/src/orcapod/protocols/legacy_data_protocols.py deleted file mode 100644 index 53a8657..0000000 --- a/src/orcapod/protocols/legacy_data_protocols.py +++ /dev/null @@ -1,2278 +0,0 @@ -# from collections.abc import Collection, Iterator, Mapping, Callable -# from datetime import datetime -# from typing import Any, ContextManager, Protocol, Self, TYPE_CHECKING, runtime_checkable -# from orcapod.protocols.hashing_protocols import ContentIdentifiable, ContentHash -# from orcapod.types import DataValue, TypeSpec - - -# if TYPE_CHECKING: -# import pyarrow as pa -# import polars as pl -# import pandas as pd - - -# @runtime_checkable -# class ExecutionEngine(Protocol): -# @property -# def name(self) -> str: ... - -# def submit_sync(self, function: Callable, *args, **kwargs) -> Any: -# """ -# Run the given function with the provided arguments. -# This method should be implemented by the execution engine. -# """ -# ... - -# async def submit_async(self, function: Callable, *args, **kwargs) -> Any: -# """ -# Asynchronously run the given function with the provided arguments. -# This method should be implemented by the execution engine. -# """ -# ... - -# # TODO: consider adding batch submission - - -# @runtime_checkable -# class Datagram(ContentIdentifiable, Protocol): -# """ -# Protocol for immutable datagram containers in Orcapod. - -# Datagrams are the fundamental units of data that flow through the system. -# They provide a unified interface for data access, conversion, and manipulation, -# ensuring consistent behavior across different storage backends (dict, Arrow table, etc.). - -# Each datagram contains: -# - **Data columns**: The primary business data (user_id, name, etc.) -# - **Meta columns**: Internal system metadata with {orcapod.META_PREFIX} (typically '__') prefixes (e.g. __processed_at, etc.) -# - **Context column**: Data context information ({orcapod.CONTEXT_KEY}) - -# Derivative of datagram (such as Packet or Tag) will also include some specific columns pertinent to the function of the specialized datagram: -# - **Source info columns**: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes (_source_user_id, etc.) used in Packet -# - **System tags**: Internal tags for system use, typically prefixed with {orcapod.SYSTEM_TAG_PREFIX} ('_system_') (_system_created_at, etc.) used in Tag - -# All operations are by design immutable - methods return new datagram instances rather than -# modifying existing ones. - -# Example: -# >>> datagram = DictDatagram({"user_id": 123, "name": "Alice"}) -# >>> updated = datagram.update(name="Alice Smith") -# >>> filtered = datagram.select("user_id", "name") -# >>> table = datagram.as_table() -# """ - -# # 1. Core Properties (Identity & Structure) -# @property -# def data_context_key(self) -> str: -# """ -# Return the data context key for this datagram. - -# This key identifies a collection of system components that collectively controls -# how information is serialized, hashed and represented, including the semantic type registry, -# arrow data hasher, and other contextual information. Same piece of information (that is two datagrams -# with an identical *logical* content) may bear distinct internal representation if they are -# represented under two distinct data context, as signified by distinct data context keys. - -# Returns: -# str: Context key for proper datagram interpretation -# """ -# ... 
- -# @property -# def meta_columns(self) -> tuple[str, ...]: -# """Return tuple of meta column names (with {orcapod.META_PREFIX} ('__') prefix).""" -# ... - -# # 2. Dict-like Interface (Data Access) -# def __getitem__(self, key: str) -> DataValue: -# """ -# Get data column value by key. - -# Provides dict-like access to data columns only. Meta columns -# are not accessible through this method (use `get_meta_value()` instead). - -# Args: -# key: Data column name. - -# Returns: -# The value stored in the specified data column. - -# Raises: -# KeyError: If the column doesn't exist in data columns. - -# Example: -# >>> datagram["user_id"] -# 123 -# >>> datagram["name"] -# 'Alice' -# """ -# ... - -# def __contains__(self, key: str) -> bool: -# """ -# Check if data column exists. - -# Args: -# key: Column name to check. - -# Returns: -# True if column exists in data columns, False otherwise. - -# Example: -# >>> "user_id" in datagram -# True -# >>> "nonexistent" in datagram -# False -# """ -# ... - -# def __iter__(self) -> Iterator[str]: -# """ -# Iterate over data column names. - -# Provides for-loop support over column names, enabling natural iteration -# patterns without requiring conversion to dict. - -# Yields: -# Data column names in no particular order. - -# Example: -# >>> for column in datagram: -# ... value = datagram[column] -# ... print(f"{column}: {value}") -# """ -# ... - -# def get(self, key: str, default: DataValue = None) -> DataValue: -# """ -# Get data column value with default fallback. - -# Args: -# key: Data column name. -# default: Value to return if column doesn't exist. - -# Returns: -# Column value if exists, otherwise the default value. - -# Example: -# >>> datagram.get("user_id") -# 123 -# >>> datagram.get("missing", "default") -# 'default' -# """ -# ... - -# # 3. Structural Information -# def keys( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# ) -> tuple[str, ...]: -# """ -# Return tuple of column names. - -# Provides access to column names with filtering options for different -# column types. Default returns only data column names. - -# Args: -# include_meta_columns: Controls meta column inclusion. -# - False: Return only data column names (default) -# - True: Include all meta column names -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include context column. - -# Returns: -# Tuple of column names based on inclusion criteria. - -# Example: -# >>> datagram.keys() # Data columns only -# ('user_id', 'name', 'email') -# >>> datagram.keys(include_meta_columns=True) -# ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') -# >>> datagram.keys(include_meta_columns=["pipeline"]) -# ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') -# >>> datagram.keys(include_context=True) -# ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') -# """ -# ... - -# def types( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# ) -> TypeSpec: -# """ -# Return type specification mapping field names to Python types. - -# The TypeSpec enables type checking and validation throughout the system. - -# Args: -# include_meta_columns: Controls meta column type inclusion. 
-# - False: Exclude meta column types (default) -# - True: Include all meta column types -# - Collection[str]: Include meta column types matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include context type. - -# Returns: -# TypeSpec mapping field names to their Python types. - -# Example: -# >>> datagram.types() -# {'user_id': , 'name': } -# """ -# ... - -# def arrow_schema( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# ) -> "pa.Schema": -# """ -# Return PyArrow schema representation. - -# The schema provides structured field and type information for efficient -# serialization and deserialization with PyArrow. - -# Args: -# include_meta_columns: Controls meta column schema inclusion. -# - False: Exclude meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include context column. - -# Returns: -# PyArrow Schema describing the datagram structure. - -# Example: -# >>> schema = datagram.arrow_schema() -# >>> schema.names -# ['user_id', 'name'] -# """ -# ... - -# # 4. Format Conversions (Export) -# def as_dict( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# ) -> dict[str, DataValue]: -# """ -# Convert datagram to dictionary format. - -# Provides a simple key-value representation useful for debugging, -# serialization, and interop with dict-based APIs. - -# Args: -# include_meta_columns: Controls meta column inclusion. -# - False: Exclude all meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include the context key. -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. - - -# Returns: -# Dictionary with requested columns as key-value pairs. - -# Example: -# >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} -# >>> full_data = datagram.as_dict( -# ... include_meta_columns=True, -# ... include_context=True -# ... ) -# """ -# ... - -# def as_table( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# ) -> "pa.Table": -# """ -# Convert datagram to PyArrow Table format. - -# Provides a standardized columnar representation suitable for analysis, -# processing, and interoperability with Arrow-based tools. - -# Args: -# include_meta_columns: Controls meta column inclusion. -# - False: Exclude all meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include the context column. -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. - -# Returns: -# PyArrow Table with requested columns. - -# Example: -# >>> table = datagram.as_table() # Data columns only -# >>> full_table = datagram.as_table( -# ... include_meta_columns=True, -# ... include_context=True -# ... 
) -# >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" -# """ -# ... - -# def as_arrow_compatible_dict( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# ) -> dict[str, Any]: -# """ -# Return dictionary with values optimized for Arrow table conversion. - -# This method returns a dictionary where values are in a form that can be -# efficiently converted to Arrow format using pa.Table.from_pylist(). - -# The key insight is that this avoids the expensive as_table() → concat pattern -# by providing values that are "Arrow-ready" while remaining in dict format -# for efficient batching. - -# Implementation note: This may involve format conversions (e.g., Path objects -# to strings, datetime objects to ISO strings, etc.) to ensure compatibility -# with Arrow's expected input formats. - -# Arrow table that results from pa.Table.from_pylist on the output of this should be accompanied -# with arrow_schema(...) with the same argument options to ensure that the schema matches the table. - -# Args: -# include_all_info: Include all available information -# include_meta_columns: Controls meta column inclusion -# include_context: Whether to include context key - -# Returns: -# Dictionary with values optimized for Arrow conversion - -# Example: -# # Efficient batch conversion pattern -# arrow_dicts = [datagram.as_arrow_compatible_dict() for datagram in datagrams] -# schema = datagrams[0].arrow_schema() -# table = pa.Table.from_pylist(arrow_dicts, schema=schema) -# """ -# ... - -# # 5. Meta Column Operations -# def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: -# """ -# Get meta column value with optional default. - -# Meta columns store operational metadata and use {orcapod.META_PREFIX} ('__') prefixes. -# This method handles both prefixed and unprefixed key formats. - -# Args: -# key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). -# default: Value to return if meta column doesn't exist. - -# Returns: -# Meta column value if exists, otherwise the default value. - -# Example: -# >>> datagram.get_meta_value("pipeline_version") # Auto-prefixed -# 'v2.1.0' -# >>> datagram.get_meta_value("__pipeline_version") # Already prefixed -# 'v2.1.0' -# >>> datagram.get_meta_value("missing", "default") -# 'default' -# """ -# ... - -# def with_meta_columns(self, **updates: DataValue) -> Self: -# """ -# Create new datagram with updated meta columns. - -# Adds or updates operational metadata while preserving all data columns. -# Keys are automatically prefixed with {orcapod.META_PREFIX} ('__') if needed. - -# Args: -# **updates: Meta column updates as keyword arguments. - -# Returns: -# New datagram instance with updated meta columns. - -# Example: -# >>> tracked = datagram.with_meta_columns( -# ... processed_by="pipeline_v2", -# ... timestamp="2024-01-15T10:30:00Z" -# ... ) -# """ -# ... - -# def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: -# """ -# Create new datagram with specified meta columns removed. - -# Args: -# *keys: Meta column keys to remove (prefixes optional). -# ignore_missing: If True, ignore missing columns without raising an error. - - -# Returns: -# New datagram instance without specified meta columns. - -# Raises: -# KeryError: If any specified meta column to drop doesn't exist and ignore_missing=False. 
- -# Example: -# >>> cleaned = datagram.drop_meta_columns("old_source", "temp_debug") -# """ -# ... - -# # 6. Data Column Operations -# def select(self, *column_names: str) -> Self: -# """ -# Create new datagram with only specified data columns. - -# Args: -# *column_names: Data column names to keep. - - -# Returns: -# New datagram instance with only specified data columns. All other columns including -# meta columns and context are preserved. - -# Raises: -# KeyError: If any specified column doesn't exist. - -# Example: -# >>> subset = datagram.select("user_id", "name", "email") -# """ -# ... - -# def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: -# """ -# Create new datagram with specified data columns removed. Note that this does not -# remove meta columns or context column. Refer to `drop_meta_columns()` for dropping -# specific meta columns. Context key column can never be dropped but a modified copy -# can be created with a different context key using `with_data_context()`. - -# Args: -# *column_names: Data column names to remove. -# ignore_missing: If True, ignore missing columns without raising an error. - -# Returns: -# New datagram instance without specified data columns. - -# Raises: -# KeryError: If any specified column to drop doesn't exist and ignore_missing=False. - -# Example: -# >>> filtered = datagram.drop("temp_field", "debug_info") -# """ -# ... - -# def rename( -# self, -# column_mapping: Mapping[str, str], -# ) -> Self: -# """ -# Create new datagram with data columns renamed. - -# Args: -# column_mapping: Mapping from old names to new names. - -# Returns: -# New datagram instance with renamed data columns. - -# Example: -# >>> renamed = datagram.rename( -# ... {"old_id": "user_id", "old_name": "full_name"}, -# ... column_types={"user_id": int} -# ... ) -# """ -# ... - -# def update(self, **updates: DataValue) -> Self: -# """ -# Create new datagram with existing column values updated. - -# Updates values in existing data columns. Will error if any specified -# column doesn't exist - use with_columns() to add new columns. - -# Args: -# **updates: Column names and their new values. - -# Returns: -# New datagram instance with updated values. - -# Raises: -# KeyError: If any specified column doesn't exist. - -# Example: -# >>> updated = datagram.update( -# ... file_path="/new/absolute/path.txt", -# ... status="processed" -# ... ) -# """ -# ... - -# def with_columns( -# self, -# column_types: Mapping[str, type] | None = None, -# **updates: DataValue, -# ) -> Self: -# """ -# Create new datagram with additional data columns. - -# Adds new data columns to the datagram. Will error if any specified -# column already exists - use update() to modify existing columns. - -# Args: -# column_types: Optional type specifications for new columns. If not provided, the column type is -# inferred from the provided values. If value is None, the column type defaults to `str`. -# **kwargs: New columns as keyword arguments. - -# Returns: -# New datagram instance with additional data columns. - -# Raises: -# ValueError: If any specified column already exists. - -# Example: -# >>> expanded = datagram.with_columns( -# ... status="active", -# ... score=95.5, -# ... column_types={"score": float} -# ... ) -# """ -# ... - -# # 7. Context Operations -# def with_context_key(self, new_context_key: str) -> Self: -# """ -# Create new datagram with different context key. - -# Changes the semantic interpretation context while preserving all data. 
-# The context key affects how columns are processed and converted. - -# Args: -# new_context_key: New context key string. - -# Returns: -# New datagram instance with updated context key. - -# Note: -# How the context is interpreted depends on the datagram implementation. -# Semantic processing may be rebuilt for the new context. - -# Example: -# >>> financial_datagram = datagram.with_context_key("financial_v1") -# """ -# ... - -# # 8. Utility Operations -# def copy(self) -> Self: -# """ -# Create a shallow copy of the datagram. - -# Returns a new datagram instance with the same data and cached values. -# This is more efficient than reconstructing from scratch when you need -# an identical datagram instance. - -# Returns: -# New datagram instance with copied data and caches. - -# Example: -# >>> copied = datagram.copy() -# >>> copied is datagram # False - different instance -# False -# """ -# ... - -# # 9. String Representations -# def __str__(self) -> str: -# """ -# Return user-friendly string representation. - -# Shows the datagram as a simple dictionary for user-facing output, -# messages, and logging. Only includes data columns for clean output. - -# Returns: -# Dictionary-style string representation of data columns only. -# """ -# ... - -# def __repr__(self) -> str: -# """ -# Return detailed string representation for debugging. - -# Shows the datagram type and comprehensive information for debugging. - -# Returns: -# Detailed representation with type and metadata information. -# """ -# ... - - -# @runtime_checkable -# class Tag(Datagram, Protocol): -# """ -# Metadata associated with each data item in a stream. - -# Tags carry contextual information about data packets as they flow through -# the computational graph. They are immutable and provide metadata that -# helps with: -# - Data lineage tracking -# - Grouping and aggregation operations -# - Temporal information (timestamps) -# - Source identification -# - Processing context - -# Common examples include: -# - Timestamps indicating when data was created/processed -# - Source identifiers showing data origin -# - Processing metadata like batch IDs or session information -# - Grouping keys for aggregation operations -# - Quality indicators or confidence scores -# """ - -# def keys( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_system_tags: bool = False, -# ) -> tuple[str, ...]: -# """ -# Return tuple of column names. - -# Provides access to column names with filtering options for different -# column types. Default returns only data column names. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column inclusion. -# - False: Return only data column names (default) -# - True: Include all meta column names -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include context column. -# include_source: Whether to include source info fields. - - -# Returns: -# Tuple of column names based on inclusion criteria. 
- -# Example: -# >>> datagram.keys() # Data columns only -# ('user_id', 'name', 'email') -# >>> datagram.keys(include_meta_columns=True) -# ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') -# >>> datagram.keys(include_meta_columns=["pipeline"]) -# ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') -# >>> datagram.keys(include_context=True) -# ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') -# """ -# ... - -# def types( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_system_tags: bool = False, -# ) -> TypeSpec: -# """ -# Return type specification mapping field names to Python types. - -# The TypeSpec enables type checking and validation throughout the system. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column type inclusion. -# - False: Exclude meta column types (default) -# - True: Include all meta column types -# - Collection[str]: Include meta column types matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include context type. -# include_source: Whether to include source info fields. - -# Returns: -# TypeSpec mapping field names to their Python types. - -# Example: -# >>> datagram.types() -# {'user_id': , 'name': } -# """ -# ... - -# def arrow_schema( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_system_tags: bool = False, -# ) -> "pa.Schema": -# """ -# Return PyArrow schema representation. - -# The schema provides structured field and type information for efficient -# serialization and deserialization with PyArrow. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column schema inclusion. -# - False: Exclude meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include context column. -# include_source: Whether to include source info fields. - - -# Returns: -# PyArrow Schema describing the datagram structure. - -# Example: -# >>> schema = datagram.arrow_schema() -# >>> schema.names -# ['user_id', 'name'] -# """ -# ... - -# def as_dict( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_system_tags: bool = False, -# ) -> dict[str, DataValue]: -# """ -# Convert datagram to dictionary format. - -# Provides a simple key-value representation useful for debugging, -# serialization, and interop with dict-based APIs. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column inclusion. -# - False: Exclude all meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include the context key. 
-# include_source: Whether to include source info fields. - - -# Returns: -# Dictionary with requested columns as key-value pairs. - -# Example: -# >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} -# >>> full_data = datagram.as_dict( -# ... include_meta_columns=True, -# ... include_context=True -# ... ) -# """ -# ... - -# def as_table( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_system_tags: bool = False, -# ) -> "pa.Table": -# """ -# Convert datagram to PyArrow Table format. - -# Provides a standardized columnar representation suitable for analysis, -# processing, and interoperability with Arrow-based tools. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column inclusion. -# - False: Exclude all meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include the context column. -# include_source: Whether to include source info columns in the schema. - -# Returns: -# PyArrow Table with requested columns. - -# Example: -# >>> table = datagram.as_table() # Data columns only -# >>> full_table = datagram.as_table( -# ... include_meta_columns=True, -# ... include_context=True -# ... ) -# >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" -# """ -# ... - -# # TODO: add this back -# # def as_arrow_compatible_dict( -# # self, -# # include_all_info: bool = False, -# # include_meta_columns: bool | Collection[str] = False, -# # include_context: bool = False, -# # include_source: bool = False, -# # ) -> dict[str, Any]: -# # """Extended version with source info support.""" -# # ... - -# def as_datagram( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_system_tags: bool = False, -# ) -> Datagram: -# """ -# Convert the packet to a Datagram. - -# Args: -# include_meta_columns: Controls meta column inclusion. -# - False: Exclude all meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - -# Returns: -# Datagram: Datagram representation of packet data -# """ -# ... - -# def system_tags(self) -> dict[str, DataValue]: -# """ -# Return metadata about the packet's source/origin. - -# Provides debugging and lineage information about where the packet -# originated. May include information like: -# - File paths for file-based sources -# - Database connection strings -# - API endpoints -# - Processing pipeline information - -# Returns: -# dict[str, str | None]: Source information for each data column as key-value pairs. -# """ -# ... - - -# @runtime_checkable -# class Packet(Datagram, Protocol): -# """ -# The actual data payload in a stream. - -# Packets represent the core data being processed through the computational -# graph. Unlike Tags (which are metadata), Packets contain the actual -# information that computations operate on. 
- -# Packets extend Datagram with additional capabilities for: -# - Source tracking and lineage -# - Content-based hashing for caching -# - Metadata inclusion for debugging - -# The distinction between Tag and Packet is crucial for understanding -# data flow: Tags provide context, Packets provide content. -# """ - -# def keys( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_source: bool = False, -# ) -> tuple[str, ...]: -# """ -# Return tuple of column names. - -# Provides access to column names with filtering options for different -# column types. Default returns only data column names. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column inclusion. -# - False: Return only data column names (default) -# - True: Include all meta column names -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include context column. -# include_source: Whether to include source info fields. - - -# Returns: -# Tuple of column names based on inclusion criteria. - -# Example: -# >>> datagram.keys() # Data columns only -# ('user_id', 'name', 'email') -# >>> datagram.keys(include_meta_columns=True) -# ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') -# >>> datagram.keys(include_meta_columns=["pipeline"]) -# ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') -# >>> datagram.keys(include_context=True) -# ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') -# """ -# ... - -# def types( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_source: bool = False, -# ) -> TypeSpec: -# """ -# Return type specification mapping field names to Python types. - -# The TypeSpec enables type checking and validation throughout the system. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column type inclusion. -# - False: Exclude meta column types (default) -# - True: Include all meta column types -# - Collection[str]: Include meta column types matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include context type. -# include_source: Whether to include source info fields. - -# Returns: -# TypeSpec mapping field names to their Python types. - -# Example: -# >>> datagram.types() -# {'user_id': , 'name': } -# """ -# ... - -# def arrow_schema( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_source: bool = False, -# ) -> "pa.Schema": -# """ -# Return PyArrow schema representation. - -# The schema provides structured field and type information for efficient -# serialization and deserialization with PyArrow. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column schema inclusion. 
-# - False: Exclude meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include context column. -# include_source: Whether to include source info fields. - - -# Returns: -# PyArrow Schema describing the datagram structure. - -# Example: -# >>> schema = datagram.arrow_schema() -# >>> schema.names -# ['user_id', 'name'] -# """ -# ... - -# def as_dict( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_source: bool = False, -# ) -> dict[str, DataValue]: -# """ -# Convert datagram to dictionary format. - -# Provides a simple key-value representation useful for debugging, -# serialization, and interop with dict-based APIs. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column inclusion. -# - False: Exclude all meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include the context key. -# include_source: Whether to include source info fields. - - -# Returns: -# Dictionary with requested columns as key-value pairs. - -# Example: -# >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} -# >>> full_data = datagram.as_dict( -# ... include_meta_columns=True, -# ... include_context=True -# ... ) -# """ -# ... - -# def as_table( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_context: bool = False, -# include_source: bool = False, -# ) -> "pa.Table": -# """ -# Convert datagram to PyArrow Table format. - -# Provides a standardized columnar representation suitable for analysis, -# processing, and interoperability with Arrow-based tools. - -# Args: -# include_all_info: If True, include all available information. This option supersedes all other inclusion options. -# include_meta_columns: Controls meta column inclusion. -# - False: Exclude all meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. -# include_context: Whether to include the context column. -# include_source: Whether to include source info columns in the schema. - -# Returns: -# PyArrow Table with requested columns. - -# Example: -# >>> table = datagram.as_table() # Data columns only -# >>> full_table = datagram.as_table( -# ... include_meta_columns=True, -# ... include_context=True -# ... ) -# >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" -# """ -# ... - -# # TODO: add this back -# # def as_arrow_compatible_dict( -# # self, -# # include_all_info: bool = False, -# # include_meta_columns: bool | Collection[str] = False, -# # include_context: bool = False, -# # include_source: bool = False, -# # ) -> dict[str, Any]: -# # """Extended version with source info support.""" -# # ... 
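For reference, the batch-conversion pattern that the `as_arrow_compatible_dict()` docstrings above describe can be sketched with plain dictionaries standing in for the per-datagram output; the field names and schema below are hypothetical stand-ins, and the point is only the single `from_pylist` call that replaces a per-datagram `as_table()` + concat pattern.

# Illustrative sketch only: plain dicts stand in for datagram.as_arrow_compatible_dict()
# results, and the "user_id"/"name" fields are hypothetical examples.
import pyarrow as pa

arrow_dicts = [
    {"user_id": 123, "name": "Alice"},
    {"user_id": 456, "name": "Bob"},
]
schema = pa.schema([("user_id", pa.int64()), ("name", pa.string())])

# One from_pylist call builds the table, avoiding per-datagram table construction.
table = pa.Table.from_pylist(arrow_dicts, schema=schema)
assert table.num_rows == 2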
- -# def as_datagram( -# self, -# include_all_info: bool = False, -# include_meta_columns: bool | Collection[str] = False, -# include_source: bool = False, -# ) -> Datagram: -# """ -# Convert the packet to a Datagram. - -# Args: -# include_meta_columns: Controls meta column inclusion. -# - False: Exclude all meta columns (default) -# - True: Include all meta columns -# - Collection[str]: Include meta columns matching these prefixes. If absent, -# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - -# Returns: -# Datagram: Datagram representation of packet data -# """ -# ... - -# def source_info(self) -> dict[str, str | None]: -# """ -# Return metadata about the packet's source/origin. - -# Provides debugging and lineage information about where the packet -# originated. May include information like: -# - File paths for file-based sources -# - Database connection strings -# - API endpoints -# - Processing pipeline information - -# Returns: -# dict[str, str | None]: Source information for each data column as key-value pairs. -# """ -# ... - -# def with_source_info( -# self, -# **source_info: str | None, -# ) -> Self: -# """ -# Create new packet with updated source information. - -# Adds or updates source metadata for the packet. This is useful for -# tracking data provenance and lineage through the computational graph. - -# Args: -# **source_info: Source metadata as keyword arguments. - -# Returns: -# New packet instance with updated source information. - -# Example: -# >>> updated_packet = packet.with_source_info( -# ... file_path="/new/path/to/file.txt", -# ... source_id="source_123" -# ... ) -# """ -# ... - - -# @runtime_checkable -# class PodFunction(Protocol): -# """ -# A function suitable for use in a FunctionPod. - -# PodFunctions define the computational logic that operates on individual -# packets within a Pod. They represent pure functions that transform -# data values without side effects. - -# These functions are designed to be: -# - Stateless: No dependency on external state -# - Deterministic: Same inputs always produce same outputs -# - Serializable: Can be cached and distributed -# - Type-safe: Clear input/output contracts - -# PodFunctions accept named arguments corresponding to packet fields -# and return transformed data values. -# """ - -# def __call__(self, **kwargs: DataValue) -> None | DataValue: -# """ -# Execute the pod function with the given arguments. - -# The function receives packet data as named arguments and returns -# either transformed data or None (for filtering operations). - -# Args: -# **kwargs: Named arguments mapping packet fields to data values - -# Returns: -# None: Filter out this packet (don't include in output) -# DataValue: Single transformed value - -# Raises: -# TypeError: If required arguments are missing -# ValueError: If argument values are invalid -# """ -# ... - - -# @runtime_checkable -# class Labelable(Protocol): -# """ -# Protocol for objects that can have a human-readable label. - -# Labels provide meaningful names for objects in the computational graph, -# making debugging, visualization, and monitoring much easier. They serve -# as human-friendly identifiers that complement the technical identifiers -# used internally. 
- -# Labels are optional but highly recommended for: -# - Debugging complex computational graphs -# - Visualization and monitoring tools -# - Error messages and logging -# - User interfaces and dashboards -# """ - -# @property -# def label(self) -> str | None: -# """ -# Return the human-readable label for this object. - -# Labels should be descriptive and help users understand the purpose -# or role of the object in the computational graph. - -# Returns: -# str: Human-readable label for this object -# None: No label is set (will use default naming) -# """ -# ... - - -# @runtime_checkable -# class Stream(ContentIdentifiable, Labelable, Protocol): -# """ -# Base protocol for all streams in Orcapod. - -# Streams represent sequences of (Tag, Packet) pairs flowing through the -# computational graph. They are the fundamental data structure connecting -# kernels and carrying both data and metadata. - -# Streams can be either: -# - Static: Immutable snapshots created at a specific point in time -# - Live: Dynamic streams that stay current with upstream dependencies - -# All streams provide: -# - Iteration over (tag, packet) pairs -# - Type information and schema access -# - Lineage information (source kernel and upstream streams) -# - Basic caching and freshness tracking -# - Conversion to common formats (tables, dictionaries) -# """ - -# @property -# def substream_identities(self) -> tuple[str, ...]: -# """ -# Unique identifiers for sub-streams within this stream. - -# This property provides a way to identify and differentiate -# sub-streams that may be part of a larger stream. It is useful -# for tracking and managing complex data flows. - -# Returns: -# tuple[str, ...]: Unique identifiers for each sub-stream -# """ -# ... - -# @property -# def execution_engine(self) -> ExecutionEngine | None: -# """ -# The execution engine attached to this stream. By default, the stream -# will use this execution engine whenever it needs to perform computation. -# None means the stream is not attached to any execution engine and will default -# to running natively. -# """ - -# @execution_engine.setter -# def execution_engine(self, engine: ExecutionEngine | None) -> None: -# """ -# Set the execution engine for this stream. - -# This allows the stream to use a specific execution engine for -# computation, enabling optimized execution strategies and resource -# management. - -# Args: -# engine: The execution engine to attach to this stream -# """ -# ... - -# def get_substream(self, substream_id: str) -> "Stream": -# """ -# Retrieve a specific sub-stream by its identifier. - -# This method allows access to individual sub-streams within the -# main stream, enabling focused operations on specific data segments. - -# Args: -# substream_id: Unique identifier for the desired sub-stream. - -# Returns: -# Stream: The requested sub-stream if it exists -# """ -# ... - -# @property -# def source(self) -> "Kernel | None": -# """ -# The kernel that produced this stream. - -# This provides lineage information for tracking data flow through -# the computational graph. Root streams (like file sources) may -# have no source kernel. - -# Returns: -# Kernel: The source kernel that created this stream -# None: This is a root stream with no source kernel -# """ -# ... - -# @property -# def upstreams(self) -> tuple["Stream", ...]: -# """ -# Input streams used to produce this stream. - -# These are the streams that were provided as input to the source -# kernel when this stream was created. 
Used for dependency tracking -# and cache invalidation. - -# Returns: -# tuple[Stream, ...]: Upstream dependency streams (empty for sources) -# """ -# ... - -# def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: -# """ -# Available keys/fields in the stream content. - -# Returns the field names present in both tags and packets. -# This provides schema information without requiring type details, -# useful for: -# - Schema inspection and exploration -# - Query planning and optimization -# - Field validation and mapping - -# Returns: -# tuple[tuple[str, ...], tuple[str, ...]]: (tag_keys, packet_keys) -# """ -# ... - -# def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: -# """ -# Type specifications for the stream content. - -# Returns the type schema for both tags and packets in this stream. -# This information is used for: -# - Type checking and validation -# - Schema inference and planning -# - Compatibility checking between kernels - -# Returns: -# tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) -# """ -# ... - -# @property -# def last_modified(self) -> datetime | None: -# """ -# When the stream's content was last modified. - -# This property is crucial for caching decisions and dependency tracking: -# - datetime: Content was last modified at this time (cacheable) -# - None: Content is never stable, always recompute (some dynamic streams) - -# Both static and live streams typically return datetime values, but -# live streams update this timestamp whenever their content changes. - -# Returns: -# datetime: Timestamp of last modification for most streams -# None: Stream content is never stable (some special dynamic streams) -# """ -# ... - -# @property -# def is_current(self) -> bool: -# """ -# Whether the stream is up-to-date with its dependencies. - -# A stream is current if its content reflects the latest state of its -# source kernel and upstream streams. This is used for cache validation -# and determining when refresh is needed. - -# For live streams, this should always return True since they stay -# current automatically. For static streams, this indicates whether -# the cached content is still valid. - -# Returns: -# bool: True if stream is up-to-date, False if refresh needed -# """ -# ... - -# def __iter__(self) -> Iterator[tuple[Tag, Packet]]: -# """ -# Iterate over (tag, packet) pairs in the stream. - -# This is the primary way to access stream data. The behavior depends -# on the stream type: -# - Static streams: Return cached/precomputed data -# - Live streams: May trigger computation and always reflect current state - -# Yields: -# tuple[Tag, Packet]: Sequential (tag, packet) pairs -# """ -# ... - -# def iter_packets( -# self, execution_engine: ExecutionEngine | None = None -# ) -> Iterator[tuple[Tag, Packet]]: -# """ -# Alias for __iter__ for explicit packet iteration. - -# Provides a more explicit method name when the intent is to iterate -# over packets specifically, improving code readability. - -# This method must return an immutable iterator -- that is, the returned iterator -# should not change and must consistently return identical tag,packet pairs across -# multiple iterations of the iterator. - -# Note that this is NOT to mean that multiple invocation of `iter_packets` must always -# return an identical iterator. The iterator returned by `iter_packets` may change -# between invocations, but the iterator itself must not change. 
Consequently, it should be understood -# that the returned iterators may be a burden on memory if the stream is large or infinite. - -# Yields: -# tuple[Tag, Packet]: Sequential (tag, packet) pairs -# """ -# ... - -# def run(self, execution_engine: ExecutionEngine | None = None) -> None: -# """ -# Execute the stream using the provided execution engine. - -# This method triggers computation of the stream content based on its -# source kernel and upstream streams. It returns a new stream instance -# containing the computed (tag, packet) pairs. - -# Args: -# execution_engine: The execution engine to use for computation - -# """ -# ... - -# async def run_async(self, execution_engine: ExecutionEngine | None = None) -> None: -# """ -# Asynchronously execute the stream using the provided execution engine. - -# This method triggers computation of the stream content based on its -# source kernel and upstream streams. It returns a new stream instance -# containing the computed (tag, packet) pairs. - -# Args: -# execution_engine: The execution engine to use for computation - -# """ -# ... - -# def as_df( -# self, -# include_data_context: bool = False, -# include_source: bool = False, -# include_system_tags: bool = False, -# include_content_hash: bool | str = False, -# execution_engine: ExecutionEngine | None = None, -# ) -> "pl.DataFrame | None": -# """ -# Convert the entire stream to a Polars DataFrame. -# """ -# ... - -# def as_table( -# self, -# include_data_context: bool = False, -# include_source: bool = False, -# include_system_tags: bool = False, -# include_content_hash: bool | str = False, -# execution_engine: ExecutionEngine | None = None, -# ) -> "pa.Table": -# """ -# Convert the entire stream to a PyArrow Table. - -# Materializes all (tag, packet) pairs into a single table for -# analysis and processing. This operation may be expensive for -# large streams or live streams that need computation. - -# If include_content_hash is True, an additional column called "_content_hash" -# containing the content hash of each packet is included. If include_content_hash -# is a string, it is used as the name of the content hash column. - -# Returns: -# pa.Table: Complete stream data as a PyArrow Table -# """ -# ... - -# def flow( -# self, execution_engine: ExecutionEngine | None = None -# ) -> Collection[tuple[Tag, Packet]]: -# """ -# Return the entire stream as a collection of (tag, packet) pairs. - -# This method materializes the stream content into a list or similar -# collection type. It is useful for small streams or when you need -# to process all data at once. - -# Args: -# execution_engine: Optional execution engine to use for computation. -# If None, the stream will use its default execution engine. -# """ -# ... - -# def join(self, other_stream: "Stream") -> "Stream": -# """ -# Join this stream with another stream. - -# Combines two streams into a single stream by merging their content. -# The resulting stream contains all (tag, packet) pairs from both -# streams, preserving their order. - -# Args: -# other_stream: The other stream to join with this one. - -# Returns: -# Self: New stream containing combined content from both streams. -# """ -# ... - -# def semi_join(self, other_stream: "Stream") -> "Stream": -# """ -# Perform a semi-join with another stream. - -# This operation filters this stream to only include packets that have -# corresponding tags in the other stream. The resulting stream contains -# all (tag, packet) pairs from this stream that match tags in the other. 
- -# Args: -# other_stream: The other stream to semi-join with this one. - -# Returns: -# Self: New stream containing filtered content based on the semi-join. -# """ -# ... - -# def map_tags( -# self, name_map: Mapping[str, str], drop_unmapped: bool = True -# ) -> "Stream": -# """ -# Map tag names in this stream to new names based on the provided mapping. -# """ -# ... - -# def map_packets( -# self, name_map: Mapping[str, str], drop_unmapped: bool = True -# ) -> "Stream": -# """ -# Map packet names in this stream to new names based on the provided mapping. -# """ -# ... - - -# @runtime_checkable -# class LiveStream(Stream, Protocol): -# """ -# A stream that automatically stays up-to-date with its upstream dependencies. - -# LiveStream extends the base Stream protocol with capabilities for "up-to-date" -# data flow and reactive computation. Unlike static streams which represent -# snapshots, LiveStreams provide the guarantee that their content always -# reflects the current state of their dependencies. - -# Key characteristics: -# - Automatically refresh the stream if changes in the upstreams are detected -# - Track last_modified timestamp when content changes -# - Support manual refresh triggering and invalidation -# - By design, LiveStream would return True for is_current except when auto-update fails. - -# LiveStreams are always returned by Kernel.__call__() methods, ensuring -# that normal kernel usage produces live, up-to-date results. - -# Caching behavior: -# - last_modified updates whenever content changes -# - Can be cached based on dependency timestamps -# - Invalidation happens automatically when upstreams change - -# Use cases: -# - Real-time data processing pipelines -# - Reactive user interfaces -# - Monitoring and alerting systems -# - Dynamic dashboard updates -# - Any scenario requiring current data -# """ - -# def refresh(self, force: bool = False) -> bool: -# """ -# Manually trigger a refresh of this stream's content. - -# Forces the stream to check its upstream dependencies and update -# its content if necessary. This is useful when: -# - You want to ensure the latest data before a critical operation -# - You need to force computation at a specific time -# - You're debugging data flow issues -# - You want to pre-compute results for performance -# Args: -# force: If True, always refresh even if the stream is current. -# If False, only refresh if the stream is not current. - -# Returns: -# bool: True if the stream was refreshed, False if it was already current. -# Note: LiveStream refreshes automatically on access, so this -# method may be a no-op for some implementations. However, it's -# always safe to call if you need to control when the cache is refreshed. -# """ -# ... - -# def invalidate(self) -> None: -# """ -# Mark this stream as invalid, forcing a refresh on next access. - -# This method is typically called when: -# - Upstream dependencies have changed -# - The source kernel has been modified -# - External data sources have been updated -# - Manual cache invalidation is needed - -# The stream will automatically refresh its content the next time -# it's accessed (via iteration, as_table(), etc.). - -# This is more efficient than immediate refresh when you know the -# data will be accessed later. -# """ -# ... - - -# @runtime_checkable -# class Kernel(ContentIdentifiable, Labelable, Protocol): -# """ -# The fundamental unit of computation in Orcapod. 
- -# Kernels are the building blocks of computational graphs, transforming -# zero, one, or more input streams into a single output stream. They -# encapsulate computation logic while providing consistent interfaces -# for validation, type checking, and execution. - -# Key design principles: -# - Immutable: Kernels don't change after creation -# - Deterministic: Same inputs always produce same outputs -# - Composable: Kernels can be chained and combined -# - Trackable: All invocations are recorded for lineage -# - Type-safe: Strong typing and validation throughout - -# Execution modes: -# - __call__(): Full-featured execution with tracking, returns LiveStream -# - forward(): Pure computation without side effects, returns Stream - -# The distinction between these modes enables both production use (with -# full tracking) and testing/debugging (without side effects). -# """ - -# @property -# def kernel_id(self) -> tuple[str, ...]: -# """ -# Return a unique identifier for this Pod. - -# The pod_id is used for caching and tracking purposes. It should -# uniquely identify the Pod's computational logic, parameters, and -# any relevant metadata that affects its behavior. - -# Returns: -# tuple[str, ...]: Unique identifier for this Pod -# """ -# ... - -# @property -# def data_context_key(self) -> str: -# """ -# Return the context key for this kernel's data processing. - -# The context key is used to interpret how data columns should be -# processed and converted. It provides semantic meaning to the data -# being processed by this kernel. - -# Returns: -# str: Context key for this kernel's data processing -# """ -# ... - -# @property -# def last_modified(self) -> datetime | None: -# """ -# When the kernel was last modified. For most kernels, this is the timestamp -# of the kernel creation. -# """ -# ... - -# def __call__( -# self, *streams: Stream, label: str | None = None, **kwargs -# ) -> LiveStream: -# """ -# Main interface for kernel invocation with full tracking and guarantees. - -# This is the primary way to invoke kernels in production. It provides -# a complete execution pipeline: -# 1. Validates input streams against kernel requirements -# 2. Registers the invocation with the computational graph -# 3. Calls forward() to perform the actual computation -# 4. Ensures the result is a LiveStream that stays current - -# The returned LiveStream automatically stays up-to-date with its -# upstream dependencies, making it suitable for real-time processing -# and reactive applications. - -# Args: -# *streams: Input streams to process (can be empty for source kernels) -# label: Optional label for this invocation (overrides kernel.label) -# **kwargs: Additional arguments for kernel configuration - -# Returns: -# LiveStream: Live stream that stays up-to-date with upstreams - -# Raises: -# ValidationError: If input streams are invalid for this kernel -# TypeMismatchError: If stream types are incompatible -# ValueError: If required arguments are missing -# """ -# ... - -# def forward(self, *streams: Stream) -> Stream: -# """ -# Perform the actual computation without side effects. - -# This method contains the core computation logic and should be -# overridden by subclasses. It performs pure computation without: -# - Registering with the computational graph -# - Performing validation (caller's responsibility) -# - Guaranteeing result type (may return static or live streams) - -# The returned stream must be accurate at the time of invocation but -# need not stay up-to-date with upstream changes. 
This makes forward() -# suitable for: -# - Testing and debugging -# - Batch processing where currency isn't required -# - Internal implementation details - -# Args: -# *streams: Input streams to process - -# Returns: -# Stream: Result of the computation (may be static or live) -# """ -# ... - -# def output_types( -# self, *streams: Stream, include_system_tags: bool = False -# ) -> tuple[TypeSpec, TypeSpec]: -# """ -# Determine output types without triggering computation. - -# This method performs type inference based on input stream types, -# enabling efficient type checking and stream property queries. -# It should be fast and not trigger any expensive computation. - -# Used for: -# - Pre-execution type validation -# - Query planning and optimization -# - Schema inference in complex pipelines -# - IDE support and developer tooling - -# Args: -# *streams: Input streams to analyze - -# Returns: -# tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) for output - -# Raises: -# ValidationError: If input types are incompatible -# TypeError: If stream types cannot be processed -# """ -# ... - -# def validate_inputs(self, *streams: Stream) -> None: -# """ -# Validate input streams, raising exceptions if incompatible. - -# This method is called automatically by __call__ before computation -# to provide fail-fast behavior. It should check: -# - Number of input streams -# - Stream types and schemas -# - Any kernel-specific requirements -# - Business logic constraints - -# The goal is to catch errors early, before expensive computation -# begins, and provide clear error messages for debugging. - -# Args: -# *streams: Input streams to validate - -# Raises: -# ValidationError: If streams are invalid for this kernel -# TypeError: If stream types are incompatible -# ValueError: If stream content violates business rules -# """ -# ... - -# def identity_structure(self, streams: Collection[Stream] | None = None) -> Any: -# """ -# Generate a unique identity structure for this kernel and/or kernel invocation. -# When invoked without streams, it should return a structure -# that uniquely identifies the kernel itself (e.g., class name, parameters). -# When invoked with streams, it should include the identity of the streams -# to distinguish different invocations of the same kernel. - -# This structure is used for: -# - Caching and memoization -# - Debugging and error reporting -# - Tracking kernel invocations in computational graphs - -# Args: -# streams: Optional input streams for this invocation. If None, identity_structure is -# based solely on the kernel. If streams are provided, they are included in the identity -# to differentiate between different invocations of the same kernel. - -# Returns: -# Any: Unique identity structure (e.g., tuple of class name and stream identities) -# """ -# ... - - -# @runtime_checkable -# class Pod(Kernel, Protocol): -# """ -# Specialized kernel for packet-level processing with advanced caching. 
- -# Pods represent a different computational model from regular kernels: -# - Process data one packet at a time (enabling fine-grained parallelism) -# - Support just-in-time evaluation (computation deferred until needed) -# - Provide stricter type contracts (clear input/output schemas) -# - Enable advanced caching strategies (packet-level caching) - -# The Pod abstraction is ideal for: -# - Expensive computations that benefit from caching -# - Operations that can be parallelized at the packet level -# - Transformations with strict type contracts -# - Processing that needs to be deferred until access time -# - Functions that operate on individual data items - -# Pods use a different execution model where computation is deferred -# until results are actually needed, enabling efficient resource usage -# and fine-grained caching. -# """ - -# @property -# def version(self) -> str: ... - -# def get_record_id(self, packet: Packet, execution_engine_hash: str) -> str: ... - -# @property -# def tiered_pod_id(self) -> dict[str, str]: -# """ -# Return a dictionary representation of the tiered pod's unique identifier. -# The key is supposed to be ordered from least to most specific, allowing -# for hierarchical identification of the pod. - -# This is primarily used for tiered memoization/caching strategies. - -# Returns: -# dict[str, str]: Dictionary representation of the pod's ID -# """ -# ... - -# def input_packet_types(self) -> TypeSpec: -# """ -# TypeSpec for input packets that this Pod can process. - -# Defines the exact schema that input packets must conform to. -# Pods are typically much stricter about input types than regular -# kernels, requiring precise type matching for their packet-level -# processing functions. - -# This specification is used for: -# - Runtime type validation -# - Compile-time type checking -# - Schema inference and documentation -# - Input validation and error reporting - -# Returns: -# TypeSpec: Dictionary mapping field names to required packet types -# """ -# ... - -# def output_packet_types(self) -> TypeSpec: -# """ -# TypeSpec for output packets that this Pod produces. - -# Defines the schema of packets that will be produced by this Pod. -# This is typically determined by the Pod's computational function -# and is used for: -# - Type checking downstream kernels -# - Schema inference in complex pipelines -# - Query planning and optimization -# - Documentation and developer tooling - -# Returns: -# TypeSpec: Dictionary mapping field names to output packet types -# """ -# ... - -# async def async_call( -# self, -# tag: Tag, -# packet: Packet, -# record_id: str | None = None, -# execution_engine: ExecutionEngine | None = None, -# ) -> tuple[Tag, Packet | None]: ... - -# def call( -# self, -# tag: Tag, -# packet: Packet, -# record_id: str | None = None, -# execution_engine: ExecutionEngine | None = None, -# ) -> tuple[Tag, Packet | None]: -# """ -# Process a single packet with its associated tag. - -# This is the core method that defines the Pod's computational behavior. 
-# It processes one (tag, packet) pair at a time, enabling: -# - Fine-grained caching at the packet level -# - Parallelization opportunities -# - Just-in-time evaluation -# - Filtering operations (by returning None) - -# The method signature supports: -# - Tag transformation (modify metadata) -# - Packet transformation (modify content) -# - Filtering (return None to exclude packet) -# - Pass-through (return inputs unchanged) - -# Args: -# tag: Metadata associated with the packet -# packet: The data payload to process - -# Returns: -# tuple[Tag, Packet | None]: -# - Tag: Output tag (may be modified from input) -# - Packet: Processed packet, or None to filter it out - -# Raises: -# TypeError: If packet doesn't match input_packet_types -# ValueError: If packet data is invalid for processing -# """ -# ... - - -# @runtime_checkable -# class CachedPod(Pod, Protocol): -# async def async_call( -# self, -# tag: Tag, -# packet: Packet, -# record_id: str | None = None, -# execution_engine: ExecutionEngine | None = None, -# skip_cache_lookup: bool = False, -# skip_cache_insert: bool = False, -# ) -> tuple[Tag, Packet | None]: ... - -# def call( -# self, -# tag: Tag, -# packet: Packet, -# record_id: str | None = None, -# execution_engine: ExecutionEngine | None = None, -# skip_cache_lookup: bool = False, -# skip_cache_insert: bool = False, -# ) -> tuple[Tag, Packet | None]: -# """ -# Process a single packet with its associated tag. - -# This is the core method that defines the Pod's computational behavior. -# It processes one (tag, packet) pair at a time, enabling: -# - Fine-grained caching at the packet level -# - Parallelization opportunities -# - Just-in-time evaluation -# - Filtering operations (by returning None) - -# The method signature supports: -# - Tag transformation (modify metadata) -# - Packet transformation (modify content) -# - Filtering (return None to exclude packet) -# - Pass-through (return inputs unchanged) - -# Args: -# tag: Metadata associated with the packet -# packet: The data payload to process - -# Returns: -# tuple[Tag, Packet | None]: -# - Tag: Output tag (may be modified from input) -# - Packet: Processed packet, or None to filter it out - -# Raises: -# TypeError: If packet doesn't match input_packet_types -# ValueError: If packet data is invalid for processing -# """ -# ... - -# def get_all_records( -# self, include_system_columns: bool = False -# ) -> "pa.Table | None": -# """ -# Retrieve all records processed by this Pod. - -# This method returns a table containing all packets processed by the Pod, -# including metadata and system columns if requested. It is useful for: -# - Debugging and analysis -# - Auditing and data lineage tracking -# - Performance monitoring - -# Args: -# include_system_columns: Whether to include system columns in the output - -# Returns: -# pa.Table | None: A table containing all processed records, or None if no records are available -# """ -# ... - - -# @runtime_checkable -# class Source(Kernel, Stream, Protocol): -# """ -# Entry point for data into the computational graph. - -# Sources are special objects that serve dual roles: -# - As Kernels: Can be invoked to produce streams -# - As Streams: Directly provide data without upstream dependencies - -# Sources represent the roots of computational graphs and typically -# interface with external data sources. They bridge the gap between -# the outside world and the Orcapod computational model. - -# Common source types: -# - File readers (CSV, JSON, Parquet, etc.) 
-# - Database connections and queries -# - API endpoints and web services -# - Generated data sources (synthetic data) -# - Manual data input and user interfaces -# - Message queues and event streams - -# Sources have unique properties: -# - No upstream dependencies (upstreams is empty) -# - Can be both invoked and iterated -# - Serve as the starting point for data lineage -# - May have their own refresh/update mechanisms -# """ - -# @property -# def tag_keys(self) -> tuple[str, ...]: -# """ -# Return the keys used for the tag in the pipeline run records. -# This is used to store the run-associated tag info. -# """ -# ... - -# @property -# def packet_keys(self) -> tuple[str, ...]: -# """ -# Return the keys used for the packet in the pipeline run records. -# This is used to store the run-associated packet info. -# """ -# ... - -# def get_all_records( -# self, include_system_columns: bool = False -# ) -> "pa.Table | None": -# """ -# Retrieve all records from the source. - -# Args: -# include_system_columns: Whether to include system columns in the output - -# Returns: -# pa.Table | None: A table containing all records, or None if no records are available -# """ -# ... - -# def as_lazy_frame(self, sort_by_tags: bool = False) -> "pl.LazyFrame | None": ... - -# def as_df(self, sort_by_tags: bool = True) -> "pl.DataFrame | None": ... - -# def as_polars_df(self, sort_by_tags: bool = False) -> "pl.DataFrame | None": ... - -# def as_pandas_df(self, sort_by_tags: bool = False) -> "pd.DataFrame | None": ... - - -# @runtime_checkable -# class Tracker(Protocol): -# """ -# Records kernel invocations and stream creation for computational graph tracking. - -# Trackers are responsible for maintaining the computational graph by recording -# relationships between kernels, streams, and invocations. They enable: -# - Lineage tracking and data provenance -# - Caching and memoization strategies -# - Debugging and error analysis -# - Performance monitoring and optimization -# - Reproducibility and auditing - -# Multiple trackers can be active simultaneously, each serving different -# purposes (e.g., one for caching, another for debugging, another for -# monitoring). This allows for flexible and composable tracking strategies. - -# Trackers can be selectively activated/deactivated to control overhead -# and focus on specific aspects of the computational graph. -# """ - -# def set_active(self, active: bool = True) -> None: -# """ -# Set the active state of the tracker. - -# When active, the tracker will record all kernel invocations and -# stream creations. When inactive, no recording occurs, reducing -# overhead for performance-critical sections. - -# Args: -# active: True to activate recording, False to deactivate -# """ -# ... - -# def is_active(self) -> bool: -# """ -# Check if the tracker is currently recording invocations. - -# Returns: -# bool: True if tracker is active and recording, False otherwise -# """ -# ... - -# def record_kernel_invocation( -# self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None -# ) -> None: -# """ -# Record a kernel invocation in the computational graph. - -# This method is called whenever a kernel is invoked. The tracker -# should record: -# - The kernel and its properties -# - The input streams that were used as input -# - Timing and performance information -# - Any relevant metadata - -# Args: -# kernel: The kernel that was invoked -# upstreams: The input streams used for this invocation -# """ -# ... 
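A minimal in-memory recorder satisfying the legacy Tracker methods documented above might look like the following sketch; the class name and the list-based storage are assumptions for illustration, not part of orcapod.

# Hypothetical, minimal tracker for the legacy protocol above: it toggles recording
# with set_active()/is_active() and appends kernel invocations to an in-memory list.
class InMemoryTracker:
    def __init__(self) -> None:
        self._active = True
        self.invocations: list[tuple] = []

    def set_active(self, active: bool = True) -> None:
        self._active = active

    def is_active(self) -> bool:
        return self._active

    def record_kernel_invocation(self, kernel, upstreams, label=None) -> None:
        # Skip recording while inactive to keep overhead out of hot paths.
        if self._active:
            self.invocations.append((kernel, tuple(upstreams), label))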
- -# def record_source_invocation( -# self, source: Source, label: str | None = None -# ) -> None: -# """ -# Record a source invocation in the computational graph. - -# This method is called whenever a source is invoked. The tracker -# should record: -# - The source and its properties -# - Timing and performance information -# - Any relevant metadata - -# Args: -# source: The source that was invoked -# """ -# ... - -# def record_pod_invocation( -# self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None -# ) -> None: -# """ -# Record a pod invocation in the computational graph. - -# This method is called whenever a pod is invoked. The tracker -# should record: -# - The pod and its properties -# - The upstream streams that were used as input -# - Timing and performance information -# - Any relevant metadata - -# Args: -# pod: The pod that was invoked -# upstreams: The input streams used for this invocation -# """ -# ... - - -# @runtime_checkable -# class TrackerManager(Protocol): -# """ -# Manages multiple trackers and coordinates their activity. - -# The TrackerManager provides a centralized way to: -# - Register and manage multiple trackers -# - Coordinate recording across all active trackers -# - Provide a single interface for graph recording -# - Enable dynamic tracker registration/deregistration - -# This design allows for: -# - Multiple concurrent tracking strategies -# - Pluggable tracking implementations -# - Easy testing and debugging (mock trackers) -# - Performance optimization (selective tracking) -# """ - -# def get_active_trackers(self) -> list[Tracker]: -# """ -# Get all currently active trackers. - -# Returns only trackers that are both registered and active, -# providing the list of trackers that will receive recording events. - -# Returns: -# list[Tracker]: List of trackers that are currently recording -# """ -# ... - -# def register_tracker(self, tracker: Tracker) -> None: -# """ -# Register a new tracker in the system. - -# The tracker will be included in future recording operations -# if it is active. Registration is separate from activation -# to allow for dynamic control of tracking overhead. - -# Args: -# tracker: The tracker to register -# """ -# ... - -# def deregister_tracker(self, tracker: Tracker) -> None: -# """ -# Remove a tracker from the system. - -# The tracker will no longer receive recording notifications -# even if it is still active. This is useful for: -# - Cleaning up temporary trackers -# - Removing failed or problematic trackers -# - Dynamic tracker management - -# Args: -# tracker: The tracker to remove -# """ -# ... - -# def record_kernel_invocation( -# self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None -# ) -> None: -# """ -# Record a stream in all active trackers. - -# This method broadcasts the stream recording to all currently -# active and registered trackers. It provides a single point -# of entry for recording events, simplifying kernel implementations. - -# Args: -# stream: The stream to record in all active trackers -# """ -# ... - -# def record_source_invocation( -# self, source: Source, label: str | None = None -# ) -> None: -# """ -# Record a source invocation in the computational graph. - -# This method is called whenever a source is invoked. The tracker -# should record: -# - The source and its properties -# - Timing and performance information -# - Any relevant metadata - -# Args: -# source: The source that was invoked -# """ -# ... 
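The removed Tracker and TrackerManager comments describe a small recording interface: set_active/is_active plus the record_*_invocation hooks. A minimal in-memory tracker matching that description might look like the sketch below (an assumption for illustration; not code from this patch series):

    # Hypothetical tracker satisfying the interface sketched in the removed comments.
    class InMemoryTracker:
        def __init__(self) -> None:
            self._active = True
            self.records: list[tuple[str, object, tuple]] = []

        def set_active(self, active: bool = True) -> None:
            self._active = active

        def is_active(self) -> bool:
            return self._active

        def record_kernel_invocation(self, kernel, upstreams, label=None) -> None:
            if self._active:
                self.records.append(("kernel", kernel, tuple(upstreams)))

        def record_source_invocation(self, source, label=None) -> None:
            if self._active:
                self.records.append(("source", source, ()))

        def record_pod_invocation(self, pod, upstreams, label=None) -> None:
            if self._active:
                self.records.append(("pod", pod, tuple(upstreams)))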
- -# def record_pod_invocation( -# self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None -# ) -> None: -# """ -# Record a stream in all active trackers. - -# This method broadcasts the stream recording to all currently` -# active and registered trackers. It provides a single point -# of entry for recording events, simplifying kernel implementations. - -# Args: -# stream: The stream to record in all active trackers -# """ -# ... - -# def no_tracking(self) -> ContextManager[None]: ... diff --git a/src/orcapod/utils/types_utils.py b/src/orcapod/utils/schema_utils.py similarity index 97% rename from src/orcapod/utils/types_utils.py rename to src/orcapod/utils/schema_utils.py index 5c25d03..a3acf83 100644 --- a/src/orcapod/utils/types_utils.py +++ b/src/orcapod/utils/schema_utils.py @@ -1,26 +1,27 @@ # Library of functions for working with TypeSpecs and for extracting TypeSpecs from a function's signature -from collections.abc import Callable, Collection, Sequence, Mapping -from typing import get_origin, get_args, Any -from orcapod.types import PythonSchema, PythonSchemaLike import inspect import logging import sys +from collections.abc import Callable, Collection, Mapping, Sequence +from typing import Any, get_args, get_origin + +from orcapod.types import PythonSchema, PythonSchemaLike logger = logging.getLogger(__name__) -def verify_against_typespec(packet: dict, typespec: PythonSchema) -> bool: +def verify_packet_schema(packet: dict, schema: PythonSchema) -> bool: """Verify that the dictionary's types match the expected types in the typespec.""" from beartype.door import is_bearable # verify that packet contains no keys not in typespec - if set(packet.keys()) - set(typespec.keys()): + if set(packet.keys()) - set(schema.keys()): logger.warning( - f"Packet contains keys not in typespec: {set(packet.keys()) - set(typespec.keys())}. " + f"Packet contains keys not in typespec: {set(packet.keys()) - set(schema.keys())}. " ) return False - for key, type_info in typespec.items(): + for key, type_info in schema.items(): if key not in packet: logger.warning( f"Key '{key}' not found in packet. 
Assuming None but this behavior may change in the future" diff --git a/tests/test_data/test_datagrams/test_arrow_datagram.py b/tests/test_data/test_datagrams/test_arrow_datagram.py index d23a4fd..5d7405e 100644 --- a/tests/test_data/test_datagrams/test_arrow_datagram.py +++ b/tests/test_data/test_datagrams/test_arrow_datagram.py @@ -19,7 +19,7 @@ from datetime import datetime, date from orcapod.core.datagrams import ArrowDatagram -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants from orcapod.protocols.core_protocols import Datagram from orcapod.protocols.hashing_protocols import ContentHash diff --git a/tests/test_data/test_datagrams/test_arrow_tag_packet.py b/tests/test_data/test_datagrams/test_arrow_tag_packet.py index 3154bdc..4a2ca01 100644 --- a/tests/test_data/test_datagrams/test_arrow_tag_packet.py +++ b/tests/test_data/test_datagrams/test_arrow_tag_packet.py @@ -14,7 +14,7 @@ from datetime import datetime, date from orcapod.core.datagrams import ArrowTag, ArrowPacket -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants class TestArrowTagInitialization: diff --git a/tests/test_data/test_datagrams/test_base_integration.py b/tests/test_data/test_datagrams/test_base_integration.py index 896a60f..4017fa0 100644 --- a/tests/test_data/test_datagrams/test_base_integration.py +++ b/tests/test_data/test_datagrams/test_base_integration.py @@ -24,7 +24,7 @@ ImmutableDict, contains_prefix_from, ) -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants class TestImmutableDict: diff --git a/tests/test_data/test_datagrams/test_dict_datagram.py b/tests/test_data/test_datagrams/test_dict_datagram.py index 5538d59..85a8e29 100644 --- a/tests/test_data/test_datagrams/test_dict_datagram.py +++ b/tests/test_data/test_datagrams/test_dict_datagram.py @@ -16,7 +16,7 @@ import pyarrow as pa from orcapod.core.datagrams import DictDatagram -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants class TestDictDatagramInitialization: diff --git a/tests/test_data/test_datagrams/test_dict_tag_packet.py b/tests/test_data/test_datagrams/test_dict_tag_packet.py index a255f79..551bd66 100644 --- a/tests/test_data/test_datagrams/test_dict_tag_packet.py +++ b/tests/test_data/test_datagrams/test_dict_tag_packet.py @@ -11,7 +11,7 @@ import pytest from orcapod.core.datagrams import DictTag, DictPacket -from orcapod.core.system_constants import constants +from orcapod.contexts.system_constants import constants class TestDictTagInitialization: From 8f98f7054dce090211d2f3a1f15ba0b7fe575f5d Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Mon, 10 Nov 2025 06:22:47 +0000 Subject: [PATCH 02/11] refactor: add merged version --- src/orcapod/core/sources/base.py | 46 +++----------------------------- 1 file changed, 4 insertions(+), 42 deletions(-) diff --git a/src/orcapod/core/sources/base.py b/src/orcapod/core/sources/base.py index f5f2881..2ece7f7 100644 --- a/src/orcapod/core/sources/base.py +++ b/src/orcapod/core/sources/base.py @@ -119,13 +119,9 @@ def __iter__(self) -> Iterator[tuple[cp.Tag, cp.Packet]]: def iter_packets( self, -<<<<<<< HEAD execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine | None = None, -======= - execution_engine: cp.ExecutionEngine | None = None, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main ) -> Iterator[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().iter_packets( @@ -140,13 +136,9 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, -<<<<<<< HEAD execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine | None = None, -======= - execution_engine: cp.ExecutionEngine | None = None, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main ) -> "pa.Table": """Delegate to the cached KernelStream.""" return self().as_table( @@ -161,13 +153,10 @@ def as_table( def flow( self, -<<<<<<< HEAD + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine - | None = None, -======= - execution_engine: cp.ExecutionEngine | None = None, + | None = None,, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main ) -> Collection[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().flow( @@ -178,13 +167,9 @@ def flow( def run( self, *args: Any, -<<<<<<< HEAD execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine | None = None, -======= - execution_engine: cp.ExecutionEngine | None = None, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main **kwargs: Any, ) -> None: """ @@ -202,13 +187,9 @@ def run( async def run_async( self, *args: Any, -<<<<<<< HEAD execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine | None = None, -======= - execution_engine: cp.ExecutionEngine | None = None, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main **kwargs: Any, ) -> None: """ @@ -387,13 +368,9 @@ def __iter__(self) -> Iterator[tuple[cp.Tag, cp.Packet]]: def iter_packets( self, -<<<<<<< HEAD execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine | None = None, -======= - execution_engine: cp.ExecutionEngine | None = None, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main ) -> Iterator[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().iter_packets( @@ -408,13 +385,9 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, -<<<<<<< HEAD execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine | None = None, -======= - execution_engine: cp.ExecutionEngine | None = None, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main ) -> "pa.Table": """Delegate to the cached KernelStream.""" return self().as_table( @@ -429,13 +402,10 @@ def as_table( def flow( self, -<<<<<<< HEAD + execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine - | None = None, -======= - execution_engine: 
cp.ExecutionEngine | None = None, + | None = None,, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main ) -> Collection[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().flow( @@ -446,13 +416,9 @@ def flow( def run( self, *args: Any, -<<<<<<< HEAD execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine | None = None, -======= - execution_engine: cp.ExecutionEngine | None = None, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main **kwargs: Any, ) -> None: """ @@ -470,13 +436,9 @@ def run( async def run_async( self, *args: Any, -<<<<<<< HEAD execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine | None = None, -======= - execution_engine: cp.ExecutionEngine | None = None, execution_engine_opts: dict[str, Any] | None = None, ->>>>>>> main **kwargs: Any, ) -> None: """ From 0a0aebb45a44fc3d9d5a3f27bbefc1260f3f8288 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 10 Nov 2025 07:02:59 +0000 Subject: [PATCH 03/11] refactor: move system constants to top of package --- src/orcapod/core/__init__.py | 2 +- src/orcapod/core/arrow_data_utils.py | 2 +- src/orcapod/core/datagrams/arrow_datagram.py | 2 +- .../core/datagrams/arrow_tag_packet.py | 2 +- src/orcapod/core/datagrams/dict_datagram.py | 2 +- src/orcapod/core/datagrams/dict_tag_packet.py | 2 +- .../core/operators/column_selection.py | 2 +- src/orcapod/core/operators/filters.py | 2 +- src/orcapod/core/operators/mappers.py | 2 +- src/orcapod/core/pods.py | 939 ++++++++++++++++++ src/orcapod/core/polars_data_utils.py | 2 +- src/orcapod/core/schema.py | 0 src/orcapod/core/streams/table_stream.py | 2 +- src/orcapod/{core => }/system_constants.py | 0 14 files changed, 950 insertions(+), 11 deletions(-) create mode 100644 src/orcapod/core/pods.py delete mode 100644 src/orcapod/core/schema.py rename src/orcapod/{core => }/system_constants.py (100%) diff --git a/src/orcapod/core/__init__.py b/src/orcapod/core/__init__.py index 1a84d7f..f483ca0 100644 --- a/src/orcapod/core/__init__.py +++ b/src/orcapod/core/__init__.py @@ -1,5 +1,5 @@ from .tracker import DEFAULT_TRACKER_MANAGER -from .system_constants import constants +from ..system_constants import constants __all__ = [ "DEFAULT_TRACKER_MANAGER", diff --git a/src/orcapod/core/arrow_data_utils.py b/src/orcapod/core/arrow_data_utils.py index 7194208..8d58da8 100644 --- a/src/orcapod/core/arrow_data_utils.py +++ b/src/orcapod/core/arrow_data_utils.py @@ -1,7 +1,7 @@ # Collection of functions to work with Arrow table data that underlies streams and/or datagrams from orcapod.utils.lazy_module import LazyModule from typing import TYPE_CHECKING -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from collections.abc import Collection if TYPE_CHECKING: diff --git a/src/orcapod/core/datagrams/arrow_datagram.py b/src/orcapod/core/datagrams/arrow_datagram.py index b9fb7e8..1c724ae 100644 --- a/src/orcapod/core/datagrams/arrow_datagram.py +++ b/src/orcapod/core/datagrams/arrow_datagram.py @@ -4,7 +4,7 @@ from orcapod import contexts from orcapod.core.datagrams.base import BaseDatagram -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from orcapod.protocols.core_protocols import ColumnConfig from orcapod.protocols.hashing_protocols import ContentHash from orcapod.types import DataValue, PythonSchema diff --git a/src/orcapod/core/datagrams/arrow_tag_packet.py 
b/src/orcapod/core/datagrams/arrow_tag_packet.py index e6d2cd1..9dc0c31 100644 --- a/src/orcapod/core/datagrams/arrow_tag_packet.py +++ b/src/orcapod/core/datagrams/arrow_tag_packet.py @@ -4,7 +4,7 @@ from orcapod import contexts from orcapod.core.datagrams.arrow_datagram import ArrowDatagram -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from orcapod.protocols.core_protocols import ColumnConfig from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue, PythonSchema diff --git a/src/orcapod/core/datagrams/dict_datagram.py b/src/orcapod/core/datagrams/dict_datagram.py index c46860e..9207708 100644 --- a/src/orcapod/core/datagrams/dict_datagram.py +++ b/src/orcapod/core/datagrams/dict_datagram.py @@ -4,7 +4,7 @@ from orcapod import contexts from orcapod.core.datagrams.base import BaseDatagram -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from orcapod.protocols.core_protocols import ColumnConfig from orcapod.protocols.hashing_protocols import ContentHash from orcapod.semantic_types import infer_python_schema_from_pylist_data diff --git a/src/orcapod/core/datagrams/dict_tag_packet.py b/src/orcapod/core/datagrams/dict_tag_packet.py index 1b20b59..cdc7854 100644 --- a/src/orcapod/core/datagrams/dict_tag_packet.py +++ b/src/orcapod/core/datagrams/dict_tag_packet.py @@ -4,7 +4,7 @@ from orcapod import contexts from orcapod.core.datagrams.dict_datagram import DictDatagram -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from orcapod.protocols.core_protocols import ColumnConfig from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue, PythonSchema, PythonSchemaLike diff --git a/src/orcapod/core/operators/column_selection.py b/src/orcapod/core/operators/column_selection.py index f37b8a4..9bea9a7 100644 --- a/src/orcapod/core/operators/column_selection.py +++ b/src/orcapod/core/operators/column_selection.py @@ -4,7 +4,7 @@ from orcapod.core.operators.base import UnaryOperator from orcapod.core.streams import TableStream -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from orcapod.errors import InputValidationError from orcapod.protocols.core_protocols import ColumnConfig, Stream from orcapod.types import PythonSchema diff --git a/src/orcapod/core/operators/filters.py b/src/orcapod/core/operators/filters.py index 4a69032..0e3bbb2 100644 --- a/src/orcapod/core/operators/filters.py +++ b/src/orcapod/core/operators/filters.py @@ -4,7 +4,7 @@ from orcapod.core.operators.base import UnaryOperator from orcapod.core.streams import TableStream -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from orcapod.errors import InputValidationError from orcapod.protocols.core_protocols import ColumnConfig, Stream from orcapod.types import PythonSchema diff --git a/src/orcapod/core/operators/mappers.py b/src/orcapod/core/operators/mappers.py index 51fd7fc..d2c2368 100644 --- a/src/orcapod/core/operators/mappers.py +++ b/src/orcapod/core/operators/mappers.py @@ -3,7 +3,7 @@ from orcapod.core.operators.base import UnaryOperator from orcapod.core.streams import TableStream -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from orcapod.errors import InputValidationError from orcapod.protocols.core_protocols import 
ColumnConfig, Stream from orcapod.types import PythonSchema diff --git a/src/orcapod/core/pods.py b/src/orcapod/core/pods.py new file mode 100644 index 0000000..3cb95d5 --- /dev/null +++ b/src/orcapod/core/pods.py @@ -0,0 +1,939 @@ +import hashlib +import logging +from abc import abstractmethod +from collections.abc import Callable, Collection, Iterable, Sequence +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any, Literal, Protocol, cast + +from orcapod import contexts +from orcapod.core.datagrams import ( + ArrowPacket, + DictPacket, +) +from orcapod.utils.git_utils import get_git_info_for_python_object +from orcapod.core.kernels import KernelStream, TrackedKernelBase +from orcapod.core.operators import Join +from orcapod.core.streams import CachedPodStream, LazyPodResultStream +from orcapod.system_constants import constants +from orcapod.hashing.hash_utils import get_function_components, get_function_signature +from orcapod.protocols import core_protocols as cp +from orcapod.protocols import hashing_protocols as hp +from orcapod.protocols.database_protocols import ArrowDatabase +from orcapod.types import DataValue, PythonSchema, PythonSchemaLike +from orcapod.utils import types_utils +from orcapod.utils.lazy_module import LazyModule + + +# TODO: extract default char count as config +def combine_hashes( + *hashes: str, + order: bool = False, + prefix_hasher_id: bool = False, + hex_char_count: int | None = 20, +) -> str: + """Combine hashes into a single hash string.""" + + # Sort for deterministic order regardless of input order + if order: + prepared_hashes = sorted(hashes) + else: + prepared_hashes = list(hashes) + combined = "".join(prepared_hashes) + combined_hash = hashlib.sha256(combined.encode()).hexdigest() + if hex_char_count is not None: + combined_hash = combined_hash[:hex_char_count] + if prefix_hasher_id: + return "sha256@" + combined_hash + return combined_hash + + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc +else: + pa = LazyModule("pyarrow") + pc = LazyModule("pyarrow.compute") + +logger = logging.getLogger(__name__) + +error_handling_options = Literal["raise", "ignore", "warn"] + + +class ActivatablePodBase(TrackedKernelBase): + """ + FunctionPod is a specialized kernel that encapsulates a function to be executed on data streams. + It allows for the execution of a function with a specific label and can be tracked by the system. + """ + + @abstractmethod + def input_packet_types(self) -> PythonSchema: + """ + Return the input typespec for the pod. This is used to validate the input streams. + """ + ... + + @abstractmethod + def output_packet_types(self) -> PythonSchema: + """ + Return the output typespec for the pod. This is used to validate the output streams. + """ + ... + + @property + def version(self) -> str: + return self._version + + @abstractmethod + def get_record_id(self, packet: cp.Packet, execution_engine_hash: str) -> str: + """ + Return the record ID for the input packet. This is used to identify the pod in the system. + """ + ... + + @property + @abstractmethod + def tiered_pod_id(self) -> dict[str, str]: + """ + Return the tiered pod ID for the pod. This is used to identify the pod in a tiered architecture. + """ + ... 
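A quick usage sketch of the combine_hashes helper defined above, assuming it is importable from the new orcapod.core.pods module: with the defaults in its signature the combined digest is truncated to 20 hex characters, order=True makes the result independent of argument order, and prefix_hasher_id=True prepends the hasher identifier.

    import hashlib

    h1 = hashlib.sha256(b"input-packet").hexdigest()
    h2 = hashlib.sha256(b"pod-identity").hexdigest()

    assert len(combine_hashes(h1, h2)) == 20  # default hex_char_count
    assert combine_hashes(h1, h2, order=True) == combine_hashes(h2, h1, order=True)
    assert combine_hashes(h1, h2, prefix_hasher_id=True).startswith("sha256@")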
+ + def __init__( + self, + error_handling: error_handling_options = "raise", + label: str | None = None, + version: str = "v0.0", + **kwargs, + ) -> None: + super().__init__(label=label, **kwargs) + self._active = True + self.error_handling = error_handling + self._version = version + import re + + match = re.match(r"\D.*(\d+)", version) + major_version = 0 + if match: + major_version = int(match.group(1)) + else: + raise ValueError( + f"Version string {version} does not contain a valid version number" + ) + self.skip_type_checking = False + self._major_version = major_version + + @property + def major_version(self) -> int: + return self._major_version + + def kernel_output_types( + self, *streams: cp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + """ + Return the input and output typespecs for the pod. + This is used to validate the input and output streams. + """ + tag_typespec, _ = streams[0].types(include_system_tags=include_system_tags) + return tag_typespec, self.output_packet_types() + + def is_active(self) -> bool: + """ + Check if the pod is active. If not, it will not process any packets. + """ + return self._active + + def set_active(self, active: bool) -> None: + """ + Set the active state of the pod. If set to False, the pod will not process any packets. + """ + self._active = active + + @staticmethod + def _join_streams(*streams: cp.Stream) -> cp.Stream: + if not streams: + raise ValueError("No streams provided for joining") + # Join the streams using a suitable join strategy + if len(streams) == 1: + return streams[0] + + joined_stream = streams[0] + for next_stream in streams[1:]: + joined_stream = Join()(joined_stream, next_stream) + return joined_stream + + def pre_kernel_processing(self, *streams: cp.Stream) -> tuple[cp.Stream, ...]: + """ + Prepare the incoming streams for execution in the pod. At least one stream must be present. + If more than one stream is present, the join of the provided streams will be returned. + """ + # if multiple streams are provided, join them + # otherwise, return as is + if len(streams) <= 1: + return streams + + output_stream = self._join_streams(*streams) + return (output_stream,) + + def validate_inputs(self, *streams: cp.Stream) -> None: + if len(streams) != 1: + raise ValueError( + f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" + ) + if self.skip_type_checking: + return + input_stream = streams[0] + _, incoming_packet_types = input_stream.types() + if not types_utils.check_typespec_compatibility( + incoming_packet_types, self.input_packet_types() + ): + # TODO: use custom exception type for better error handling + raise ValueError( + f"Incoming packet data type {incoming_packet_types} from {input_stream} is not compatible with expected input typespec {self.input_packet_types()}" + ) + + def prepare_output_stream( + self, *streams: cp.Stream, label: str | None = None + ) -> KernelStream: + return KernelStream(source=self, upstreams=streams, label=label) + + def forward(self, *streams: cp.Stream) -> cp.Stream: + assert len(streams) == 1, "PodBase.forward expects exactly one input stream" + return LazyPodResultStream(pod=self, prepared_stream=streams[0]) + + @abstractmethod + def call( + self, + tag: cp.Tag, + packet: cp.Packet, + record_id: str | None = None, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: ... 
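The _join_streams helper above reduces N input streams to one by folding Join() over them pairwise. Conceptually this is the same left-to-right reduction as the sketch below, where Join is the operator imported at the top of the new pods.py and the stream arguments are placeholders:

    from functools import reduce

    def join_all(*streams):
        if not streams:
            raise ValueError("No streams provided for joining")
        # Each Join()(left, right) call yields a new joined stream; fold left-to-right.
        return reduce(lambda left, right: Join()(left, right), streams)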
+ + @abstractmethod + async def async_call( + self, + tag: cp.Tag, + packet: cp.Packet, + record_id: str | None = None, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: ... + + def track_invocation(self, *streams: cp.Stream, label: str | None = None) -> None: + if not self._skip_tracking and self._tracker_manager is not None: + self._tracker_manager.record_pod_invocation(self, streams, label=label) + + +class CallableWithPod(Protocol): + def __call__(self, *args, **kwargs) -> Any: ... + + @property + def pod(self) -> "FunctionPod": ... + + +def function_pod( + output_keys: str | Collection[str] | None = None, + function_name: str | None = None, + version: str = "v0.0", + label: str | None = None, + **kwargs, +) -> Callable[..., CallableWithPod]: + """ + Decorator that attaches FunctionPod as pod attribute. + + Args: + output_keys: Keys for the function output(s) + function_name: Name of the function pod; if None, defaults to the function name + **kwargs: Additional keyword arguments to pass to the FunctionPod constructor. Please refer to the FunctionPod documentation for details. + + Returns: + CallableWithPod: Decorated function with `pod` attribute holding the FunctionPod instance + """ + + def decorator(func: Callable) -> CallableWithPod: + if func.__name__ == "": + raise ValueError("Lambda functions cannot be used with function_pod") + + # Store the original function in the module for pickling purposes + # and make sure to change the name of the function + + # Create a simple typed function pod + pod = FunctionPod( + function=func, + output_keys=output_keys, + function_name=function_name or func.__name__, + version=version, + label=label, + **kwargs, + ) + setattr(func, "pod", pod) + return cast(CallableWithPod, func) + + return decorator + + +class FunctionPod(ActivatablePodBase): + def __init__( + self, + function: cp.PodFunction, + output_keys: str | Collection[str] | None = None, + function_name=None, + version: str = "v0.0", + input_python_schema: PythonSchemaLike | None = None, + output_python_schema: PythonSchemaLike | Sequence[type] | None = None, + label: str | None = None, + function_info_extractor: hp.FunctionInfoExtractor | None = None, + **kwargs, + ) -> None: + self.function = function + + if output_keys is None: + output_keys = [] + if isinstance(output_keys, str): + output_keys = [output_keys] + self.output_keys = output_keys + if function_name is None: + if hasattr(self.function, "__name__"): + function_name = getattr(self.function, "__name__") + else: + raise ValueError( + "function_name must be provided if function has no __name__ attribute" + ) + self.function_name = function_name + # extract the first full index (potentially with leading 0) in the version string + if not isinstance(version, str): + raise TypeError(f"Version must be a string, got {type(version)}") + + super().__init__(label=label or self.function_name, version=version, **kwargs) + + # extract input and output types from the function signature + input_packet_types, output_packet_types = ( + types_utils.extract_function_typespecs( + self.function, + self.output_keys, + input_typespec=input_python_schema, + output_typespec=output_python_schema, + ) + ) + + # get git info for the function + env_info = get_git_info_for_python_object(self.function) + if env_info is None: + git_hash = "unknown" + else: + git_hash = env_info.get("git_commit_hash", "unknown") + if env_info.get("git_repo_status") == "dirty": + 
git_hash += "-dirty" + self._git_hash = git_hash + + self._input_packet_schema = dict(input_packet_types) + self._output_packet_schema = dict(output_packet_types) + # TODO: add output packet converter for speed up + + self._function_info_extractor = function_info_extractor + object_hasher = self.data_context.object_hasher + # TODO: fix and replace with object_hasher protocol specific methods + self._function_signature_hash = object_hasher.hash_object( + get_function_signature(self.function) + ).to_string() + self._function_content_hash = object_hasher.hash_object( + get_function_components(self.function) + ).to_string() + + self._output_packet_type_hash = object_hasher.hash_object( + self.output_packet_types() + ).to_string() + + self._total_pod_id_hash = object_hasher.hash_object( + self.tiered_pod_id + ).to_string() + + @property + def tiered_pod_id(self) -> dict[str, str]: + return { + "version": self.version, + "signature": self._function_signature_hash, + "content": self._function_content_hash, + "git_hash": self._git_hash, + } + + @property + def reference(self) -> tuple[str, ...]: + return ( + self.function_name, + self._output_packet_type_hash, + "v" + str(self.major_version), + ) + + def get_record_id( + self, + packet: cp.Packet, + execution_engine_hash: str, + ) -> str: + return combine_hashes( + str(packet.content_hash()), + self._total_pod_id_hash, + execution_engine_hash, + prefix_hasher_id=True, + ) + + def input_packet_types(self) -> PythonSchema: + """ + Return the input typespec for the function pod. + This is used to validate the input streams. + """ + return self._input_packet_schema.copy() + + def output_packet_types(self) -> PythonSchema: + """ + Return the output typespec for the function pod. + This is used to validate the output streams. 
+ """ + return self._output_packet_schema.copy() + + def __repr__(self) -> str: + return f"FunctionPod:{self.function_name}" + + def __str__(self) -> str: + include_module = self.function.__module__ != "__main__" + func_sig = get_function_signature( + self.function, + name_override=self.function_name, + include_module=include_module, + ) + return f"FunctionPod:{func_sig}" + + def call( + self, + tag: cp.Tag, + packet: cp.Packet, + record_id: str | None = None, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> tuple[cp.Tag, DictPacket | None]: + if not self.is_active(): + logger.info( + f"Pod is not active: skipping computation on input packet {packet}" + ) + return tag, None + + execution_engine_hash = execution_engine.name if execution_engine else "default" + + # any kernel/pod invocation happening inside the function will NOT be tracked + if not isinstance(packet, dict): + input_dict = packet.as_dict(include_source=False) + else: + input_dict = packet + + with self._tracker_manager.no_tracking(): + if execution_engine is not None: + # use the provided execution engine to run the function + values = execution_engine.submit_sync( + self.function, + fn_kwargs=input_dict, + engine_opts=execution_engine_opts, + ) + else: + values = self.function(**input_dict) + + output_data = self.process_function_output(values) + + # TODO: extract out this function + def combine(*components: tuple[str, ...]) -> str: + inner_parsed = [":".join(component) for component in components] + return "::".join(inner_parsed) + + if record_id is None: + # if record_id is not provided, generate it from the packet + record_id = self.get_record_id(packet, execution_engine_hash) + source_info = { + k: combine(self.reference, (record_id,), (k,)) for k in output_data + } + + output_packet = DictPacket( + output_data, + source_info=source_info, + python_schema=self.output_packet_types(), + data_context=self.data_context, + ) + return tag, output_packet + + async def async_call( + self, + tag: cp.Tag, + packet: cp.Packet, + record_id: str | None = None, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: + """ + Asynchronous call to the function pod. This is a placeholder for future implementation. + Currently, it behaves like the synchronous call. 
+ """ + if not self.is_active(): + logger.info( + f"Pod is not active: skipping computation on input packet {packet}" + ) + return tag, None + + execution_engine_hash = execution_engine.name if execution_engine else "default" + + # any kernel/pod invocation happening inside the function will NOT be tracked + # with self._tracker_manager.no_tracking(): + # FIXME: figure out how to properly make context manager work with async/await + # any kernel/pod invocation happening inside the function will NOT be tracked + if not isinstance(packet, dict): + input_dict = packet.as_dict(include_source=False) + else: + input_dict = packet + if execution_engine is not None: + # use the provided execution engine to run the function + values = await execution_engine.submit_async( + self.function, fn_kwargs=input_dict, engine_opts=execution_engine_opts + ) + else: + values = self.function(**input_dict) + + output_data = self.process_function_output(values) + + # TODO: extract out this function + def combine(*components: tuple[str, ...]) -> str: + inner_parsed = [":".join(component) for component in components] + return "::".join(inner_parsed) + + if record_id is None: + # if record_id is not provided, generate it from the packet + record_id = self.get_record_id(packet, execution_engine_hash) + source_info = { + k: combine(self.reference, (record_id,), (k,)) for k in output_data + } + + output_packet = DictPacket( + output_data, + source_info=source_info, + python_schema=self.output_packet_types(), + data_context=self.data_context, + ) + return tag, output_packet + + def process_function_output(self, values: Any) -> dict[str, DataValue]: + output_values = [] + if len(self.output_keys) == 0: + output_values = [] + elif len(self.output_keys) == 1: + output_values = [values] # type: ignore + elif isinstance(values, Iterable): + output_values = list(values) # type: ignore + elif len(self.output_keys) > 1: + raise ValueError( + "Values returned by function must be a pathlike or a sequence of pathlikes" + ) + + if len(output_values) != len(self.output_keys): + raise ValueError( + f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" + ) + + return {k: v for k, v in zip(self.output_keys, output_values)} + + def kernel_identity_structure( + self, streams: Collection[cp.Stream] | None = None + ) -> Any: + id_struct = (self.__class__.__name__,) + self.reference + # if streams are provided, perform pre-processing step, validate, and add the + # resulting single stream to the identity structure + if streams is not None and len(streams) != 0: + id_struct += tuple(streams) + + return id_struct + + +class WrappedPod(ActivatablePodBase): + """ + A wrapper for an existing pod, allowing for additional functionality or modifications without changing the original pod. + This class is meant to serve as a base class for other pods that need to wrap existing pods. + Note that only the call logic is pass through to the wrapped pod, but the forward logic is not. 
+ """ + + def __init__( + self, + pod: cp.Pod, + label: str | None = None, + data_context: str | contexts.DataContext | None = None, + **kwargs, + ) -> None: + # if data_context is not explicitly given, use that of the contained pod + if data_context is None: + data_context = pod.data_context_key + super().__init__( + label=label, + data_context=data_context, + **kwargs, + ) + self.pod = pod + + @property + def reference(self) -> tuple[str, ...]: + """ + Return the pod ID, which is the function name of the wrapped pod. + This is used to identify the pod in the system. + """ + return self.pod.reference + + def get_record_id(self, packet: cp.Packet, execution_engine_hash: str) -> str: + return self.pod.get_record_id(packet, execution_engine_hash) + + @property + def tiered_pod_id(self) -> dict[str, str]: + """ + Return the tiered pod ID for the wrapped pod. This is used to identify the pod in a tiered architecture. + """ + return self.pod.tiered_pod_id + + def computed_label(self) -> str | None: + return self.pod.label + + def input_packet_types(self) -> PythonSchema: + """ + Return the input typespec for the stored pod. + This is used to validate the input streams. + """ + return self.pod.input_packet_types() + + def output_packet_types(self) -> PythonSchema: + """ + Return the output typespec for the stored pod. + This is used to validate the output streams. + """ + return self.pod.output_packet_types() + + def validate_inputs(self, *streams: cp.Stream) -> None: + self.pod.validate_inputs(*streams) + + def call( + self, + tag: cp.Tag, + packet: cp.Packet, + record_id: str | None = None, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: + return self.pod.call( + tag, + packet, + record_id=record_id, + execution_engine=execution_engine, + execution_engine_opts=execution_engine_opts, + ) + + async def async_call( + self, + tag: cp.Tag, + packet: cp.Packet, + record_id: str | None = None, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: + return await self.pod.async_call( + tag, + packet, + record_id=record_id, + execution_engine=execution_engine, + execution_engine_opts=execution_engine_opts, + ) + + def kernel_identity_structure( + self, streams: Collection[cp.Stream] | None = None + ) -> Any: + return self.pod.identity_structure(streams) + + def __repr__(self) -> str: + return f"WrappedPod({self.pod!r})" + + def __str__(self) -> str: + return f"WrappedPod:{self.pod!s}" + + +class CachedPod(WrappedPod): + """ + A pod that caches the results of the wrapped pod. + This is useful for pods that are expensive to compute and can benefit from caching. + """ + + # name of the column in the tag store that contains the packet hash + DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" + + def __init__( + self, + pod: cp.Pod, + result_database: ArrowDatabase, + record_path_prefix: tuple[str, ...] 
= (), + match_tier: str | None = None, + retrieval_mode: Literal["latest", "most_specific"] = "latest", + **kwargs, + ): + super().__init__(pod, **kwargs) + self.record_path_prefix = record_path_prefix + self.result_database = result_database + self.match_tier = match_tier + self.retrieval_mode = retrieval_mode + self.mode: Literal["production", "development"] = "production" + + def set_mode(self, mode: str) -> None: + if mode not in ("production", "development"): + raise ValueError(f"Invalid mode: {mode}") + self.mode = mode + + @property + def version(self) -> str: + return self.pod.version + + @property + def record_path(self) -> tuple[str, ...]: + """ + Return the path to the record in the result store. + This is used to store the results of the pod. + """ + return self.record_path_prefix + self.reference + + def call( + self, + tag: cp.Tag, + packet: cp.Packet, + record_id: str | None = None, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[cp.Tag, cp.Packet | None]: + # TODO: consider logic for overwriting existing records + execution_engine_hash = execution_engine.name if execution_engine else "default" + if record_id is None: + record_id = self.get_record_id( + packet, execution_engine_hash=execution_engine_hash + ) + output_packet = None + if not skip_cache_lookup and self.mode == "production": + print("Checking for cache...") + output_packet = self.get_cached_output_for_packet(packet) + if output_packet is not None: + print(f"Cache hit for {packet}!") + if output_packet is None: + tag, output_packet = super().call( + tag, + packet, + record_id=record_id, + execution_engine=execution_engine, + execution_engine_opts=execution_engine_opts, + ) + if ( + output_packet is not None + and not skip_cache_insert + and self.mode == "production" + ): + self.record_packet(packet, output_packet, record_id=record_id) + + return tag, output_packet + + async def async_call( + self, + tag: cp.Tag, + packet: cp.Packet, + record_id: str | None = None, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[cp.Tag, cp.Packet | None]: + # TODO: consider logic for overwriting existing records + execution_engine_hash = execution_engine.name if execution_engine else "default" + + if record_id is None: + record_id = self.get_record_id( + packet, execution_engine_hash=execution_engine_hash + ) + output_packet = None + if not skip_cache_lookup: + output_packet = self.get_cached_output_for_packet(packet) + if output_packet is None: + tag, output_packet = await super().async_call( + tag, + packet, + record_id=record_id, + execution_engine=execution_engine, + execution_engine_opts=execution_engine_opts, + ) + if output_packet is not None and not skip_cache_insert: + self.record_packet( + packet, + output_packet, + record_id=record_id, + execution_engine=execution_engine, + execution_engine_opts=execution_engine_opts, + ) + + return tag, output_packet + + def forward(self, *streams: cp.Stream) -> cp.Stream: + assert len(streams) == 1, "PodBase.forward expects exactly one input stream" + return CachedPodStream(pod=self, input_stream=streams[0]) + + def record_packet( + self, + input_packet: cp.Packet, + output_packet: cp.Packet, + record_id: str | None = None, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: 
dict[str, Any] | None = None, + skip_duplicates: bool = False, + ) -> cp.Packet: + """ + Record the output packet against the input packet in the result store. + """ + + # TODO: consider incorporating execution_engine_opts into the record + data_table = output_packet.as_table(include_context=True, include_source=True) + + for i, (k, v) in enumerate(self.tiered_pod_id.items()): + # add the tiered pod ID to the data table + data_table = data_table.add_column( + i, + f"{constants.POD_ID_PREFIX}{k}", + pa.array([v], type=pa.large_string()), + ) + + # add the input packet hash as a column + data_table = data_table.add_column( + 0, + constants.INPUT_PACKET_HASH, + pa.array([str(input_packet.content_hash())], type=pa.large_string()), + ) + # add execution engine information + execution_engine_hash = execution_engine.name if execution_engine else "default" + data_table = data_table.append_column( + constants.EXECUTION_ENGINE, + pa.array([execution_engine_hash], type=pa.large_string()), + ) + + # add computation timestamp + timestamp = datetime.now(timezone.utc) + data_table = data_table.append_column( + constants.POD_TIMESTAMP, + pa.array([timestamp], type=pa.timestamp("us", tz="UTC")), + ) + + if record_id is None: + record_id = self.get_record_id( + input_packet, execution_engine_hash=execution_engine_hash + ) + + self.result_database.add_record( + self.record_path, + record_id, + data_table, + skip_duplicates=skip_duplicates, + ) + # if result_flag is None: + # # TODO: do more specific error handling + # raise ValueError( + # f"Failed to record packet {input_packet} in result store {self.result_store}" + # ) + # # TODO: make store return retrieved table + return output_packet + + def get_cached_output_for_packet(self, input_packet: cp.Packet) -> cp.Packet | None: + """ + Retrieve the output packet from the result store based on the input packet. + If more than one output packet is found, conflict resolution strategy + will be applied. + If the output packet is not found, return None. 
+ """ + # result_table = self.result_store.get_record_by_id( + # self.record_path, + # self.get_entry_hash(input_packet), + # ) + + # get all records with matching the input packet hash + # TODO: add match based on match_tier if specified + constraints = {constants.INPUT_PACKET_HASH: str(input_packet.content_hash())} + if self.match_tier is not None: + constraints[f"{constants.POD_ID_PREFIX}{self.match_tier}"] = ( + self.pod.tiered_pod_id[self.match_tier] + ) + + result_table = self.result_database.get_records_with_column_value( + self.record_path, + constraints, + ) + if result_table is None or result_table.num_rows == 0: + return None + + if result_table.num_rows > 1: + logger.info( + f"Performing conflict resolution for multiple records for {input_packet.content_hash().display_name()}" + ) + if self.retrieval_mode == "latest": + result_table = result_table.sort_by( + self.DATA_RETRIEVED_FLAG, ascending=False + ).take([0]) + elif self.retrieval_mode == "most_specific": + # match by the most specific pod ID + # trying next level if not found + for k, v in reversed(self.tiered_pod_id.items()): + search_result = result_table.filter( + pc.field(f"{constants.POD_ID_PREFIX}{k}") == v + ) + if search_result.num_rows > 0: + result_table = search_result.take([0]) + break + if result_table.num_rows > 1: + logger.warning( + f"No matching record found for {input_packet.content_hash().display_name()} with tiered pod ID {self.tiered_pod_id}" + ) + result_table = result_table.sort_by( + self.DATA_RETRIEVED_FLAG, ascending=False + ).take([0]) + + else: + raise ValueError( + f"Unknown retrieval mode: {self.retrieval_mode}. Supported modes are 'latest' and 'most_specific'." + ) + + pod_id_columns = [ + f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() + ] + result_table = result_table.drop_columns(pod_id_columns) + result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) + + # note that data context will be loaded from the result store + return ArrowPacket( + result_table, + meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, + ) + + def get_all_cached_outputs( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Get all records from the result store for this pod. + If include_system_columns is True, include system columns in the result. 
+ """ + record_id_column = ( + constants.PACKET_RECORD_ID if include_system_columns else None + ) + result_table = self.result_database.get_all_records( + self.record_path, record_id_column=record_id_column + ) + if result_table is None or result_table.num_rows == 0: + return None + + if not include_system_columns: + # remove input packet hash and tiered pod ID columns + pod_id_columns = [ + f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() + ] + result_table = result_table.drop_columns(pod_id_columns) + result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) + + return result_table diff --git a/src/orcapod/core/polars_data_utils.py b/src/orcapod/core/polars_data_utils.py index 07284c4..f98e68e 100644 --- a/src/orcapod/core/polars_data_utils.py +++ b/src/orcapod/core/polars_data_utils.py @@ -2,7 +2,7 @@ from collections.abc import Collection from typing import TYPE_CHECKING -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: diff --git a/src/orcapod/core/schema.py b/src/orcapod/core/schema.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/orcapod/core/streams/table_stream.py b/src/orcapod/core/streams/table_stream.py index 1581ec5..55eed9e 100644 --- a/src/orcapod/core/streams/table_stream.py +++ b/src/orcapod/core/streams/table_stream.py @@ -9,7 +9,7 @@ ArrowTag, DictTag, ) -from orcapod.core.system_constants import constants +from orcapod.system_constants import constants from orcapod.protocols.core_protocols import Pod, Tag, Packet, Stream, ColumnConfig from orcapod.types import PythonSchema diff --git a/src/orcapod/core/system_constants.py b/src/orcapod/system_constants.py similarity index 100% rename from src/orcapod/core/system_constants.py rename to src/orcapod/system_constants.py From f8748a34880290fde55a6aaf9e8a53bc8f5b0656 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Mon, 10 Nov 2025 20:59:51 +0000 Subject: [PATCH 04/11] wip: further refinement of cached packet function --- pyproject.toml | 4 +- src/orcapod/core/datagrams/arrow_datagram.py | 2 +- src/orcapod/core/datagrams/base.py | 23 + src/orcapod/core/packet_function.py | 260 +++++++++- src/orcapod/core/pods.py | 10 +- src/orcapod/core/streams/cached_pod_stream.py | 479 ++++++++++++++++++ src/orcapod/core/streams/lazy_pod_stream.py | 257 ++++++++++ src/orcapod/core/streams/table_stream.py | 7 +- .../protocols/core_protocols/datagrams.py | 11 + .../core_protocols/packet_function.py | 3 +- src/orcapod/protocols/core_protocols/pod.py | 34 +- src/orcapod/system_constants.py | 6 +- uv.lock | 156 ++++-- 13 files changed, 1182 insertions(+), 70 deletions(-) create mode 100644 src/orcapod/core/streams/cached_pod_stream.py create mode 100644 src/orcapod/core/streams/lazy_pod_stream.py diff --git a/pyproject.toml b/pyproject.toml index eb38aba..0c0462b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,9 @@ dependencies = [ "deltalake>=1.0.2", "graphviz>=0.21", "gitpython>=3.1.45", + "starfix>=0.1.3", + "pygraphviz>=1.14", + "uuid-utils>=0.11.1", ] readme = "README.md" requires-python = ">=3.11.0" @@ -55,7 +58,6 @@ dev = [ "jsonschema>=4.25.0", "minio>=7.2.16", "pyarrow-stubs>=20.0.0.20250716", - "pygraphviz>=1.14", "pyiceberg>=0.9.1", "pyright>=1.1.404", "pytest>=8.3.5", diff --git a/src/orcapod/core/datagrams/arrow_datagram.py b/src/orcapod/core/datagrams/arrow_datagram.py index 1c724ae..2399e56 100644 --- a/src/orcapod/core/datagrams/arrow_datagram.py +++ b/src/orcapod/core/datagrams/arrow_datagram.py @@ -76,7 +76,7 @@ def __init__( The input table is automatically split into data, meta, and context components based on column naming conventions. """ - super().__init__() + super().__init__(**kwargs) # Validate table has exactly one row for datagram if len(table) != 1: diff --git a/src/orcapod/core/datagrams/base.py b/src/orcapod/core/datagrams/base.py index 653f283..9495fac 100644 --- a/src/orcapod/core/datagrams/base.py +++ b/src/orcapod/core/datagrams/base.py @@ -20,6 +20,9 @@ from abc import abstractmethod from collections.abc import Collection, Iterator, Mapping from typing import TYPE_CHECKING, Any, Self, TypeAlias +from uuid import UUID + +from uuid_utils import uuid7 from orcapod.core.base import ContentIdentifiableBase from orcapod.protocols.core_protocols import ColumnConfig @@ -118,6 +121,22 @@ class BaseDatagram(ContentIdentifiableBase): is interpreted and used is left to concrete implementations. """ + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._uuid = None + + @property + def uuid(self) -> UUID: + """ + Return the UUID of this datagram. + + Returns: + UUID: The unique identifier for this instance of datagram. 
+ """ + if self._uuid is None: + self._uuid = UUID(bytes=uuid7().bytes) + return self._uuid + # TODO: revisit handling of identity structure for datagrams def identity_structure(self) -> Any: raise NotImplementedError() @@ -271,4 +290,8 @@ def copy(self, include_cache: bool = True) -> Self: """Create a shallow copy of the datagram.""" new_datagram = object.__new__(self.__class__) new_datagram._data_context = self._data_context + if include_cache: + # preserve uuid if cache is preserved + # TODO: revisit this logic + new_datagram._uuid = self._uuid return new_datagram diff --git a/src/orcapod/core/packet_function.py b/src/orcapod/core/packet_function.py index ba02085..836fef8 100644 --- a/src/orcapod/core/packet_function.py +++ b/src/orcapod/core/packet_function.py @@ -7,13 +7,16 @@ from typing import TYPE_CHECKING, Any, Literal from orcapod.core.base import OrcapodBase -from orcapod.core.datagrams import DictPacket +from orcapod.core.datagrams import DictPacket, ArrowPacket from orcapod.hashing.hash_utils import get_function_components, get_function_signature -from orcapod.protocols.core_protocols import Packet +from orcapod.protocols.core_protocols import Packet, PacketFunction, Tag, Stream from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import schema_utils from orcapod.utils.git_utils import get_git_info_for_python_object from orcapod.utils.lazy_module import LazyModule +from orcapod.protocols.database_protocols import ArrowDatabase +from orcapod.system_constants import constants +from datetime import datetime, timezone def process_function_output(self, values: Any) -> dict[str, DataValue]: @@ -91,6 +94,17 @@ def __init__(self, version: str = "v0.0", **kwargs): f"Version string {version} does not contain a valid version number" ) + @property + def uri(self) -> tuple[str, ...]: + # TODO: make this more efficient + return ( + f"{self.packet_function_type_id}", + f"{self.canonical_function_name}", + self.data_context.object_hasher.hash_object( + self.output_packet_schema + ).to_string(), + ) + def identity_structure(self) -> Any: return self.get_function_variation_data() @@ -167,13 +181,6 @@ def packet_function_type_id(self) -> str: """ return "python.function.v0" - @property - def canonical_function_name(self) -> str: - """ - Human-readable function identifier - """ - return self._function_name - def __init__( self, function: Callable[..., Any], @@ -238,6 +245,13 @@ def __init__( self.output_packet_schema ).to_string() + @property + def canonical_function_name(self) -> str: + """ + Human-readable function identifier + """ + return self._function_name + def get_function_variation_data(self) -> dict[str, Any]: """Raw data defining function variation - system computes hash""" return { @@ -305,3 +319,231 @@ def call(self, packet: Packet) -> Packet | None: async def async_call(self, packet: Packet) -> Packet | None: raise NotImplementedError("Async call not implemented for synchronous function") + + +class PacketFunctionWrapper(PacketFunctionBase): + """ + Wrapper around a PacketFunction to modify or extend its behavior. 
+ """ + + def __init__(self, packet_function: PacketFunction, **kwargs) -> None: + super().__init__(**kwargs) + self._packet_function = packet_function + + def computed_label(self) -> str | None: + return self._packet_function.label + + @property + def major_version(self) -> int: + return self._packet_function.major_version + + @property + def minor_version_string(self) -> str: + return self._packet_function.minor_version_string + + @property + def packet_function_type_id(self) -> str: + return self._packet_function.packet_function_type_id + + @property + def canonical_function_name(self) -> str: + return self._packet_function.canonical_function_name + + @property + def input_packet_schema(self) -> PythonSchema: + return self._packet_function.input_packet_schema + + @property + def output_packet_schema(self) -> PythonSchema: + return self._packet_function.output_packet_schema + + def get_function_variation_data(self) -> dict[str, Any]: + return self._packet_function.get_function_variation_data() + + def get_execution_data(self) -> dict[str, Any]: + return self._packet_function.get_execution_data() + + def call(self, packet: Packet) -> Packet | None: + return self._packet_function.call(packet) + + async def async_call(self, packet: Packet) -> Packet | None: + return await self._packet_function.async_call(packet) + + +class CachedPacketFunction(PacketFunctionWrapper): + """ + Wrapper around a PacketFunction that caches results for identical input packets. + """ + + # name of the column in the tag store that contains the packet hash + DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" + + def __init__( + self, + packet_function: PacketFunction, + result_database: ArrowDatabase, + record_path_prefix: tuple[str, ...] = (), + **kwargs, + ) -> None: + super().__init__(packet_function, **kwargs) + self._record_path_prefix = record_path_prefix + self._result_database = result_database + + @property + def record_path(self) -> tuple[str, ...]: + """ + Return the path to the record in the result store. + This is used to store the results of the pod. + """ + return self._record_path_prefix + self.uri + + def call( + self, + packet: Packet, + *, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> Packet | None: + # execution_engine_hash = execution_engine.name if execution_engine else "default" + output_packet = None + if not skip_cache_lookup: + print("Checking for cache...") + output_packet = self.get_cached_output_for_packet(packet) + if output_packet is not None: + print(f"Cache hit for {packet}!") + if output_packet is None: + output_packet = self._packet_function.call(packet) + if output_packet is not None and not skip_cache_insert: + self.record_packet(packet, output_packet) + + return output_packet + + def record_packet( + self, + input_packet: Packet, + output_packet: Packet, + skip_duplicates: bool = False, + ) -> Packet: + """ + Record the output packet against the input packet in the result store. 
+ """ + + # TODO: consider incorporating execution_engine_opts into the record + data_table = output_packet.as_table(columns={"source": True, "context": True}) + + # for i, (k, v) in enumerate(self.tiered_pod_id.items()): + # # add the tiered pod ID to the data table + # data_table = data_table.add_column( + # i, + # f"{constants.POD_ID_PREFIX}{k}", + # pa.array([v], type=pa.large_string()), + # ) + + # add the input packet hash as a column + data_table = data_table.add_column( + 0, + constants.INPUT_PACKET_HASH_COL, + pa.array([input_packet.content_hash().to_string()], type=pa.large_string()), + ) + # # add execution engine information + # execution_engine_hash = execution_engine.name if execution_engine else "default" + # data_table = data_table.append_column( + # constants.EXECUTION_ENGINE, + # pa.array([execution_engine_hash], type=pa.large_string()), + # ) + + # add computation timestamp + timestamp = datetime.now(timezone.utc) + data_table = data_table.append_column( + constants.POD_TIMESTAMP, + pa.array([timestamp], type=pa.timestamp("us", tz="UTC")), + ) + + # if record_id is None: + # record_id = self.get_record_id( + # input_packet, execution_engine_hash=execution_engine_hash + # ) + + # self.result_database.add_record( + # self.record_path, + # record_id, + # data_table, + # skip_duplicates=skip_duplicates, + # ) + # if result_flag is None: + # # TODO: do more specific error handling + # raise ValueError( + # f"Failed to record packet {input_packet} in result store {self.result_store}" + # ) + # # TODO: make store return retrieved table + return output_packet + + def get_cached_output_for_packet(self, input_packet: Packet) -> Packet | None: + """ + Retrieve the output packet from the result store based on the input packet. + If more than one output packet is found, conflict resolution strategy + will be applied. + If the output packet is not found, return None. + """ + # result_table = self.result_store.get_record_by_id( + # self.record_path, + # self.get_entry_hash(input_packet), + # ) + + # get all records with matching the input packet hash + # TODO: add match based on match_tier if specified + + # TODO: implement matching policy/strategy + constraints = { + constants.INPUT_PACKET_HASH_COL: input_packet.content_hash().to_string() + } + + result_table = self._result_database.get_records_with_column_value( + self.record_path, + constraints, + ) + if result_table is None or result_table.num_rows == 0: + return None + + if result_table.num_rows > 1: + logger.info( + f"Performing conflict resolution for multiple records for {input_packet.content_hash().display_name()}" + ) + result_table = result_table.sort_by( + constants.POD_TIMESTAMP, ascending=False + ).take([0]) + + # result_table = result_table.drop_columns(pod_id_columns) + result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH_COL) + + # note that data context will be loaded from the result store + return ArrowPacket( + result_table, + meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, + ) + + def get_all_cached_outputs( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Get all records from the result store for this pod. + If include_system_columns is True, include system columns in the result. 
+ """ + record_id_column = ( + constants.PACKET_RECORD_ID if include_system_columns else None + ) + result_table = self._result_database.get_all_records( + self.record_path, record_id_column=record_id_column + ) + if result_table is None or result_table.num_rows == 0: + return None + + # if not include_system_columns: + # # remove input packet hash and tiered pod ID columns + # pod_id_columns = [ + # f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() + # ] + # result_table = result_table.drop_columns(pod_id_columns) + # result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH_COL) + + return result_table diff --git a/src/orcapod/core/pods.py b/src/orcapod/core/pods.py index 3cb95d5..3d4ca26 100644 --- a/src/orcapod/core/pods.py +++ b/src/orcapod/core/pods.py @@ -806,7 +806,7 @@ def record_packet( # add the input packet hash as a column data_table = data_table.add_column( 0, - constants.INPUT_PACKET_HASH, + constants.INPUT_PACKET_HASH_COL, pa.array([str(input_packet.content_hash())], type=pa.large_string()), ) # add execution engine information @@ -856,7 +856,9 @@ def get_cached_output_for_packet(self, input_packet: cp.Packet) -> cp.Packet | N # get all records with matching the input packet hash # TODO: add match based on match_tier if specified - constraints = {constants.INPUT_PACKET_HASH: str(input_packet.content_hash())} + constraints = { + constants.INPUT_PACKET_HASH_COL: str(input_packet.content_hash()) + } if self.match_tier is not None: constraints[f"{constants.POD_ID_PREFIX}{self.match_tier}"] = ( self.pod.tiered_pod_id[self.match_tier] @@ -904,7 +906,7 @@ def get_cached_output_for_packet(self, input_packet: cp.Packet) -> cp.Packet | N f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() ] result_table = result_table.drop_columns(pod_id_columns) - result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) + result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH_COL) # note that data context will be loaded from the result store return ArrowPacket( @@ -934,6 +936,6 @@ def get_all_cached_outputs( f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() ] result_table = result_table.drop_columns(pod_id_columns) - result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) + result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH_COL) return result_table diff --git a/src/orcapod/core/streams/cached_pod_stream.py b/src/orcapod/core/streams/cached_pod_stream.py new file mode 100644 index 0000000..172eace --- /dev/null +++ b/src/orcapod/core/streams/cached_pod_stream.py @@ -0,0 +1,479 @@ +import logging +from collections.abc import Iterator +from typing import TYPE_CHECKING, Any + +from orcapod.system_constants import constants +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema +from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule +from orcapod.core.streams.base import StreamBase +from orcapod.core.streams.table_stream import TableStream + + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc + import polars as pl + +else: + pa = LazyModule("pyarrow") + pc = LazyModule("pyarrow.compute") + pl = LazyModule("polars") + + +# TODO: consider using this instead of making copy of dicts +# from types import MappingProxyType + +logger = logging.getLogger(__name__) + + +class CachedPodStream(StreamBase): + """ + A fixed stream that lazily processes packets from a prepared input stream. 
+ This is what Pod.process() returns - it's static/fixed but efficient. + """ + + # TODO: define interface for storage or pod storage + def __init__(self, pod: cp.CachedPod, input_stream: cp.Stream, **kwargs): + super().__init__(source=pod, upstreams=(input_stream,), **kwargs) + self.pod = pod + self.input_stream = input_stream + self._set_modified_time() # set modified time to when we obtain the iterator + # capture the immutable iterator from the input stream + + self._prepared_stream_iterator = input_stream.iter_packets() + + # Packet-level caching (from your PodStream) + self._cached_output_packets: list[tuple[cp.Tag, cp.Packet | None]] | None = None + self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None + + def set_mode(self, mode: str) -> None: + return self.pod.set_mode(mode) + + @property + def mode(self) -> str: + return self.pod.mode + + def test(self) -> cp.Stream: + return self + + async def run_async( + self, + *args: Any, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + **kwargs: Any, + ) -> None: + """ + Runs the stream, processing the input stream and preparing the output stream. + This is typically called before iterating over the packets. + """ + if self._cached_output_packets is None: + cached_results = [] + + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_content_hash=constants.INPUT_PACKET_HASH_COL, + include_source=True, + include_system_tags=True, + ) + existing_entries = self.pod.get_all_cached_outputs( + include_system_columns=True + ) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH_COL]) + existing = None + else: + all_results = target_entries.join( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ), + keys=[constants.INPUT_PACKET_HASH_COL], + join_type="left outer", + right_suffix="_right", + ) + # grab all columns from target_entries first + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH_COL]) + ) + + existing = ( + all_results.filter(pc.is_valid(pc.field("_exists"))) + .drop_columns(target_entries.column_names) + .drop_columns(["_exists"]) + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = TableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + + pending_calls = [] + if missing is not None and missing.num_rows > 0: + for tag, packet in TableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + pending = self.pod.async_call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, + ) + pending_calls.append(pending) + import asyncio + + completed_calls = await asyncio.gather(*pending_calls) + for result in completed_calls: + cached_results.append(result) + + self._cached_output_packets = 
cached_results + self._set_modified_time() + + def run( + self, + *args: Any, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + **kwargs: Any, + ) -> None: + cached_results = [] + + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_system_tags=True, + include_source=True, + include_content_hash=constants.INPUT_PACKET_HASH_COL, + execution_engine=execution_engine, + ) + existing_entries = self.pod.get_all_cached_outputs(include_system_columns=True) + if ( + existing_entries is None + or existing_entries.num_rows == 0 + or self.mode == "development" + ): + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH_COL]) + existing = None + else: + # TODO: do more proper replacement operation + target_df = pl.DataFrame(target_entries) + existing_df = pl.DataFrame( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ) + ) + all_results_df = target_df.join( + existing_df, + on=constants.INPUT_PACKET_HASH_COL, + how="left", + suffix="_right", + ) + all_results = all_results_df.to_arrow() + + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH_COL]) + ) + + existing = all_results.filter( + pc.is_valid(pc.field("_exists")) + ).drop_columns( + [ + "_exists", + constants.INPUT_PACKET_HASH_COL, + constants.PACKET_RECORD_ID, + *self.input_stream.keys()[1], # remove the input packet keys + ] + # TODO: look into NOT fetching back the record ID + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = TableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + + if missing is not None and missing.num_rows > 0: + hash_to_output_lut: dict[str, cp.Packet | None] = {} + for tag, packet in TableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + packet_hash = packet.content_hash().to_string() + if packet_hash in hash_to_output_lut: + output_packet = hash_to_output_lut[packet_hash] + else: + tag, output_packet = self.pod.call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, + ) + # TODO: use getter for execution engine opts + hash_to_output_lut[packet_hash] = output_packet + cached_results.append((tag, output_packet)) + + self._cached_output_packets = cached_results + self._set_modified_time() + + def iter_packets( + self, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + """ + Processes the input stream and prepares the output stream. + This is typically called before iterating over the packets. 
+ """ + if self._cached_output_packets is None: + cached_results = [] + + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_system_tags=True, + include_source=True, + include_content_hash=constants.INPUT_PACKET_HASH_COL, + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, + ) + existing_entries = self.pod.get_all_cached_outputs( + include_system_columns=True + ) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH_COL]) + existing = None + else: + # missing = target_entries.join( + # existing_entries, + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left anti", + # ) + # Single join that gives you both missing and existing + # More efficient - only bring the key column from existing_entries + # .select([constants.INPUT_PACKET_HASH]).append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + + # TODO: do more proper replacement operation + target_df = pl.DataFrame(target_entries) + existing_df = pl.DataFrame( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ) + ) + all_results_df = target_df.join( + existing_df, + on=constants.INPUT_PACKET_HASH_COL, + how="left", + suffix="_right", + ) + all_results = all_results_df.to_arrow() + # all_results = target_entries.join( + # existing_entries.append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left outer", + # right_suffix="_right", # rename the existing records in case of collision of output packet keys with input packet keys + # ) + # grab all columns from target_entries first + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH_COL]) + ) + + existing = all_results.filter( + pc.is_valid(pc.field("_exists")) + ).drop_columns( + [ + "_exists", + constants.INPUT_PACKET_HASH_COL, + constants.PACKET_RECORD_ID, + *self.input_stream.keys()[1], # remove the input packet keys + ] + # TODO: look into NOT fetching back the record ID + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = TableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + yield tag, packet + + if missing is not None and missing.num_rows > 0: + hash_to_output_lut: dict[str, cp.Packet | None] = {} + for tag, packet in TableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + packet_hash = packet.content_hash().to_string() + if packet_hash in hash_to_output_lut: + output_packet = hash_to_output_lut[packet_hash] + else: + tag, output_packet = self.pod.call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, + ) + hash_to_output_lut[packet_hash] = output_packet + cached_results.append((tag, output_packet)) + if output_packet is not None: + yield 
tag, output_packet + + self._cached_output_packets = cached_results + self._set_modified_time() + else: + for tag, packet in self._cached_output_packets: + if packet is not None: + yield tag, packet + + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + + tag_keys, _ = self.input_stream.keys(include_system_tags=include_system_tags) + packet_keys = tuple(self.pod.output_packet_types().keys()) + return tag_keys, packet_keys + + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_typespec, _ = self.input_stream.types( + include_system_tags=include_system_tags + ) + # TODO: check if copying can be avoided + packet_typespec = dict(self.pod.output_packet_types()) + return tag_typespec, packet_typespec + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> "pa.Table": + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + tag_schema, packet_schema = None, None + for tag, packet in self.iter_packets( + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, + ): + if tag_schema is None: + tag_schema = tag.arrow_schema(include_system_tags=True) + if packet_schema is None: + packet_schema = packet.arrow_schema( + include_context=True, + include_source=True, + ) + all_tags.append(tag.as_dict(include_system_tags=True)) + # FIXME: using in the pinch conversion to str from path + # replace with an appropriate semantic converter-based approach! + dict_patcket = packet.as_dict(include_context=True, include_source=True) + all_packets.append(dict_patcket) + + converter = self.data_context.type_converter + + struct_packets = converter.python_dicts_to_struct_dicts(all_packets) + all_tags_as_tables: pa.Table = pa.Table.from_pylist( + all_tags, schema=tag_schema + ) + all_packets_as_tables: pa.Table = pa.Table.from_pylist( + struct_packets, schema=packet_schema + ) + + self._cached_output_table = arrow_utils.hstack_tables( + all_tags_as_tables, all_packets_as_tables + ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." 
+ ) + + drop_columns = [] + if not include_source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(constants.CONTEXT_KEY) + if not include_system_tags: + # TODO: come up with a more efficient approach + drop_columns.extend( + [ + c + for c in self._cached_output_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) + + output_table = self._cached_output_table.drop_columns(drop_columns) + + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + for tag, packet in self.iter_packets(execution_engine=execution_engine): + content_hashes.append(packet.content_hash().to_string()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." + ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + + if sort_by_tags: + try: + # TODO: consider having explicit tag/packet properties? + output_table = output_table.sort_by( + [(column, "ascending") for column in self.keys()[0]] + ) + except pa.ArrowTypeError: + pass + + return output_table diff --git a/src/orcapod/core/streams/lazy_pod_stream.py b/src/orcapod/core/streams/lazy_pod_stream.py new file mode 100644 index 0000000..aab5b65 --- /dev/null +++ b/src/orcapod/core/streams/lazy_pod_stream.py @@ -0,0 +1,257 @@ +import logging +from collections.abc import Iterator +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from orcapod.system_constants import constants +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema +from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule +from orcapod.core.streams.base import StreamBase + + +if TYPE_CHECKING: + import pyarrow as pa + import polars as pl + import asyncio +else: + pa = LazyModule("pyarrow") + pl = LazyModule("polars") + asyncio = LazyModule("asyncio") + + +# TODO: consider using this instead of making copy of dicts +# from types import MappingProxyType + +logger = logging.getLogger(__name__) + + +class LazyPodResultStream(StreamBase): + """ + A fixed stream that lazily processes packets from a prepared input stream. + This is what Pod.process() returns - it's static/fixed but efficient. + """ + + def __init__(self, pod: cp.Pod, prepared_stream: cp.Stream, **kwargs): + super().__init__(source=pod, upstreams=(prepared_stream,), **kwargs) + self.pod = pod + self.prepared_stream = prepared_stream + # capture the immutable iterator from the prepared stream + self._prepared_stream_iterator = prepared_stream.iter_packets() + self._set_modified_time() # set modified time to AFTER we obtain the iterator + # note that the invocation of iter_packets on upstream likely triggeres the modified time + # to be updated on the usptream. Hence you want to set this stream's modified time after that. 
+ + # Packet-level caching (from your PodStream) + self._cached_output_packets: dict[int, tuple[cp.Tag, cp.Packet | None]] = {} + self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None + + def iter_packets( + self, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + if self._prepared_stream_iterator is not None: + for i, (tag, packet) in enumerate(self._prepared_stream_iterator): + if i in self._cached_output_packets: + # Use cached result + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + else: + # Process packet + processed = self.pod.call( + tag, + packet, + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, + ) + # TODO: verify the proper use of execution engine opts + if processed is not None: + # Update shared cache for future iterators (optimization) + self._cached_output_packets[i] = processed + tag, packet = processed + if packet is not None: + yield tag, packet + + # Mark completion by releasing the iterator + self._prepared_stream_iterator = None + else: + # Yield from snapshot of complete cache + for i in range(len(self._cached_output_packets)): + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + + async def run_async( + self, + *args: Any, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + **kwargs: Any, + ) -> None: + if self._prepared_stream_iterator is not None: + pending_call_lut = {} + for i, (tag, packet) in enumerate(self._prepared_stream_iterator): + if i not in self._cached_output_packets: + # Process packet + pending_call_lut[i] = self.pod.async_call( + tag, + packet, + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, + ) + + indices = list(pending_call_lut.keys()) + pending_calls = [pending_call_lut[i] for i in indices] + + results = await asyncio.gather(*pending_calls) + for i, result in zip(indices, results): + self._cached_output_packets[i] = result + + # Mark completion by releasing the iterator + self._prepared_stream_iterator = None + + def run( + self, + *args: Any, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + **kwargs: Any, + ) -> None: + # Fallback to synchronous run + self.flow( + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts or self._execution_engine_opts, + ) + + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. 
+ """ + + tag_keys, _ = self.prepared_stream.keys(include_system_tags=include_system_tags) + packet_keys = tuple(self.pod.output_packet_types().keys()) + return tag_keys, packet_keys + + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_typespec, _ = self.prepared_stream.types( + include_system_tags=include_system_tags + ) + # TODO: check if copying can be avoided + packet_typespec = dict(self.pod.output_packet_types()) + return tag_typespec, packet_typespec + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + execution_engine_opts: dict[str, Any] | None = None, + ) -> "pa.Table": + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + tag_schema, packet_schema = None, None + for tag, packet in self.iter_packets( + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, + ): + if tag_schema is None: + tag_schema = tag.arrow_schema(include_system_tags=True) + if packet_schema is None: + packet_schema = packet.arrow_schema( + include_context=True, + include_source=True, + ) + all_tags.append(tag.as_dict(include_system_tags=True)) + # FIXME: using in the pinch conversion to str from path + # replace with an appropriate semantic converter-based approach! + dict_patcket = packet.as_dict(include_context=True, include_source=True) + all_packets.append(dict_patcket) + + # TODO: re-verify the implemetation of this conversion + converter = self.data_context.type_converter + + struct_packets = converter.python_dicts_to_struct_dicts(all_packets) + all_tags_as_tables: pa.Table = pa.Table.from_pylist( + all_tags, schema=tag_schema + ) + all_packets_as_tables: pa.Table = pa.Table.from_pylist( + struct_packets, schema=packet_schema + ) + + self._cached_output_table = arrow_utils.hstack_tables( + all_tags_as_tables, all_packets_as_tables + ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." + ) + + drop_columns = [] + if not include_system_tags: + # TODO: get system tags more effiicently + drop_columns.extend( + [ + c + for c in self._cached_output_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) + if not include_source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(constants.CONTEXT_KEY) + + output_table = self._cached_output_table.drop(drop_columns) + + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + # TODO: verify that order will be preserved + for tag, packet in self.iter_packets( + execution_engine=execution_engine or self.execution_engine, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, + ): + content_hashes.append(packet.content_hash().to_string()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." 
+ ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + + if sort_by_tags: + # TODO: reimplement using polars natively + output_table = ( + pl.DataFrame(output_table) + .sort(by=self.keys()[0], descending=False) + .to_arrow() + ) + # output_table = output_table.sort_by( + # [(column, "ascending") for column in self.keys()[0]] + # ) + return output_table diff --git a/src/orcapod/core/streams/table_stream.py b/src/orcapod/core/streams/table_stream.py index 55eed9e..94e498a 100644 --- a/src/orcapod/core/streams/table_stream.py +++ b/src/orcapod/core/streams/table_stream.py @@ -10,7 +10,7 @@ DictTag, ) from orcapod.system_constants import constants -from orcapod.protocols.core_protocols import Pod, Tag, Packet, Stream, ColumnConfig +from orcapod.protocols.core_protocols import Pod, Tag, Stream, ColumnConfig from orcapod.types import PythonSchema from orcapod.utils import arrow_utils @@ -19,12 +19,9 @@ if TYPE_CHECKING: import pyarrow as pa - import polars as pl - import pandas as pd else: pa = LazyModule("pyarrow") - pl = LazyModule("polars") - pd = LazyModule("pandas") + logger = logging.getLogger(__name__) diff --git a/src/orcapod/protocols/core_protocols/datagrams.py b/src/orcapod/protocols/core_protocols/datagrams.py index de80d1d..ed6d6fa 100644 --- a/src/orcapod/protocols/core_protocols/datagrams.py +++ b/src/orcapod/protocols/core_protocols/datagrams.py @@ -11,6 +11,7 @@ from orcapod.protocols.hashing_protocols import ContentIdentifiable from orcapod.types import DataType, DataValue, PythonSchema +from uuid import UUID if TYPE_CHECKING: import pyarrow as pa @@ -177,6 +178,16 @@ class Datagram(ContentIdentifiable, Protocol): >>> table = datagram.as_table() """ + @property + def uuid(self) -> UUID: + """ + Return the UUID of this datagram. + + Returns: + UUID: The unique identifier for this instance of datagram. + """ + ... + # 1. Core Properties (Identity & Structure) @property def data_context_key(self) -> str: diff --git a/src/orcapod/protocols/core_protocols/packet_function.py b/src/orcapod/protocols/core_protocols/packet_function.py index c501f01..a20e269 100644 --- a/src/orcapod/protocols/core_protocols/packet_function.py +++ b/src/orcapod/protocols/core_protocols/packet_function.py @@ -1,11 +1,12 @@ from typing import Any, Protocol, runtime_checkable from orcapod.protocols.core_protocols.datagrams import Packet +from orcapod.protocols.core_protocols.labelable import Labelable from orcapod.types import PythonSchema @runtime_checkable -class PacketFunction(Protocol): +class PacketFunction(Labelable, Protocol): """ Protocol for packet-processing function. 
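The CachedPacketFunction wrapper introduced earlier in this patch (core/packet_function.py) keys its result store on the input packet's content hash, appends a UTC timestamp to every stored record, and resolves multiple matching records by keeping the most recent one. The following self-contained sketch illustrates only that caching pattern; CachingWrapper, content_hash, and the plain-dict record store are hypothetical stand-ins, not the ArrowDatabase-backed orcapod implementation.

from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Callable
import hashlib
import json


def content_hash(packet: dict[str, Any]) -> str:
    # Stand-in for Packet.content_hash(): a stable digest of the packet contents.
    return hashlib.sha256(json.dumps(packet, sort_keys=True).encode()).hexdigest()


@dataclass
class CachingWrapper:
    # Wraps a packet function and memoizes outputs by input content hash.
    fn: Callable[[dict[str, Any]], dict[str, Any] | None]
    # hash -> list of (timestamp, output); the newest record wins on lookup.
    records: dict[str, list[tuple[datetime, dict[str, Any] | None]]] = field(
        default_factory=dict
    )

    def call(
        self,
        packet: dict[str, Any],
        *,
        skip_cache_lookup: bool = False,
        skip_cache_insert: bool = False,
    ) -> dict[str, Any] | None:
        key = content_hash(packet)
        if not skip_cache_lookup and key in self.records:
            # Conflict resolution: return the most recently recorded output.
            return max(self.records[key], key=lambda rec: rec[0])[1]
        output = self.fn(packet)
        if output is not None and not skip_cache_insert:
            self.records.setdefault(key, []).append(
                (datetime.now(timezone.utc), output)
            )
        return output


calls: list[dict[str, Any]] = []

def double(packet: dict[str, Any]) -> dict[str, Any]:
    calls.append(packet)
    return {"doubled": packet["x"] * 2}

cached = CachingWrapper(double)
print(cached.call({"x": 3}))  # computed: {'doubled': 6}
print(cached.call({"x": 3}))  # cache hit: underlying function not called again
print(len(calls))             # 1

Keying on content rather than object identity is what lets identical inputs reuse prior results across runs, which is also why CachedPacketFunction.call() exposes skip_cache_lookup and skip_cache_insert switches.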
diff --git a/src/orcapod/protocols/core_protocols/pod.py b/src/orcapod/protocols/core_protocols/pod.py index 39d947b..6b98790 100644 --- a/src/orcapod/protocols/core_protocols/pod.py +++ b/src/orcapod/protocols/core_protocols/pod.py @@ -1,7 +1,8 @@ from collections.abc import Collection from typing import Any, Protocol, TypeAlias, runtime_checkable -from orcapod.protocols.core_protocols.datagrams import ColumnConfig +from orcapod.protocols.core_protocols.packet_function import PacketFunction +from orcapod.protocols.core_protocols.datagrams import ColumnConfig, Tag, Packet from orcapod.protocols.core_protocols.labelable import Labelable from orcapod.protocols.core_protocols.streams import Stream from orcapod.protocols.core_protocols.temporal import Temporal @@ -145,3 +146,34 @@ def process(self, *streams: Stream) -> Stream: Stream: Result of the computation (may be static or live) """ ... + + +@runtime_checkable +class FunctionPod(Pod, Protocol): + """ + A Pod that represents a pure function from input streams to an output stream. + + FunctionPods have no side effects and always produce the same output + for the same inputs. They are suitable for: + - Stateless transformations + - Mathematical operations + - Data format conversions + + Because they are pure functions, FunctionPods can be: + - Cached based on input content hashes + - Parallelized across multiple inputs + - Reasoned about more easily in complex graphs + """ + + @property + def packet_function(self) -> PacketFunction: + """ + Retrieve the core packet processing function. + + This function defines the per-packet computational logic of the FunctionPod. + It is invoked for each packet in the input streams to produce output packets. + + Returns: + PacketFunction: The packet processing function + """ + ...
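Because FunctionPod above is declared runtime_checkable, conformance is structural: any object exposing the required members (for this new hunk, a packet_function property) satisfies the protocol without inheriting from it, and isinstance() verifies only that the attribute is present, not its signature. A minimal sketch of that structural check, using simplified hypothetical stand-ins (PacketFn, HasPacketFunction, DoublerPod) rather than the full orcapod protocols:

from typing import Any, Callable, Protocol, runtime_checkable

# Simplified stand-in for the packet-processing callable; the real protocol
# in orcapod.protocols.core_protocols carries many more members.
PacketFn = Callable[[dict[str, Any]], dict[str, Any] | None]


@runtime_checkable
class HasPacketFunction(Protocol):
    @property
    def packet_function(self) -> PacketFn: ...


class DoublerPod:
    # Satisfies the protocol structurally; no inheritance from it is required.

    def _double(self, packet: dict[str, Any]) -> dict[str, Any]:
        return {key: value * 2 for key, value in packet.items()}

    @property
    def packet_function(self) -> PacketFn:
        return self._double


pod = DoublerPod()
# isinstance() against a runtime_checkable Protocol only checks member presence.
assert isinstance(pod, HasPacketFunction)
print(pod.packet_function({"x": 3}))  # {'x': 6}

The real FunctionPod additionally inherits the Pod, Labelable, and Temporal members; the stand-in keeps only the property that this hunk adds.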
diff --git a/src/orcapod/system_constants.py b/src/orcapod/system_constants.py index 0cc5503..c52d77a 100644 --- a/src/orcapod/system_constants.py +++ b/src/orcapod/system_constants.py @@ -4,7 +4,7 @@ SOURCE_INFO_PREFIX = "source_" POD_ID_PREFIX = "pod_id_" DATA_CONTEXT_KEY = "context_key" -INPUT_PACKET_HASH = "input_packet_hash" +INPUT_PACKET_HASH_COL = "input_packet_hash" PACKET_RECORD_ID = "packet_id" SYSTEM_TAG_PREFIX = "tag" POD_VERSION = "pod_version" @@ -48,8 +48,8 @@ def POD_ID_PREFIX(self) -> str: return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{POD_ID_PREFIX}" @property - def INPUT_PACKET_HASH(self) -> str: - return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{INPUT_PACKET_HASH}" + def INPUT_PACKET_HASH_COL(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{INPUT_PACKET_HASH_COL}" @property def PACKET_RECORD_ID(self) -> str: diff --git a/uv.lock b/uv.lock index d4f48ba..7835ec7 100644 --- a/uv.lock +++ b/uv.lock @@ -1030,7 +1030,7 @@ wheels = [ [[package]] name = "ipykernel" -version = "6.30.1" +version = "7.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "appnope", marker = "sys_platform == 'darwin'" }, @@ -1047,9 +1047,9 @@ dependencies = [ { name = "tornado" }, { name = "traitlets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/bb/76/11082e338e0daadc89c8ff866185de11daf67d181901038f9e139d109761/ipykernel-6.30.1.tar.gz", hash = "sha256:6abb270161896402e76b91394fcdce5d1be5d45f456671e5080572f8505be39b", size = 166260, upload-time = "2025-08-04T15:47:35.018Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b9/a4/4948be6eb88628505b83a1f2f40d90254cab66abf2043b3c40fa07dfce0f/ipykernel-7.1.0.tar.gz", hash = "sha256:58a3fc88533d5930c3546dc7eac66c6d288acde4f801e2001e65edc5dc9cf0db", size = 174579, upload-time = "2025-10-27T09:46:39.471Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fc/c7/b445faca8deb954fe536abebff4ece5b097b923de482b26e78448c89d1dd/ipykernel-6.30.1-py3-none-any.whl", hash = "sha256:aa6b9fb93dca949069d8b85b6c79b2518e32ac583ae9c7d37c51d119e18b3fb4", size = 117484, upload-time = "2025-08-04T15:47:32.622Z" }, + { url = "https://files.pythonhosted.org/packages/a3/17/20c2552266728ceba271967b87919664ecc0e33efca29c3efc6baf88c5f9/ipykernel-7.1.0-py3-none-any.whl", hash = "sha256:763b5ec6c5b7776f6a8d7ce09b267693b4e5ce75cb50ae696aaefb3c85e1ea4c", size = 117968, upload-time = "2025-10-27T09:46:37.805Z" }, ] [[package]] @@ -1373,6 +1373,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" }, ] +[[package]] +name = "maturin" +version = "1.9.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9a/35/c3370188492f4c139c7a318f438d01b8185c216303c49c4bc885c98b6afb/maturin-1.9.6.tar.gz", hash = "sha256:2c2ae37144811d365509889ed7220b0598487f1278c2441829c3abf56cc6324a", size = 214846, upload-time = "2025-10-07T12:45:08.408Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/5c/b435418ba4ba2647a1f7a95d53314991b1e556e656ae276dea993c3bce1d/maturin-1.9.6-py3-none-linux_armv6l.whl", hash = "sha256:26e3ab1a42a7145824210e9d763f6958f2c46afb1245ddd0bab7d78b1f59bb3f", size = 8134483, upload-time = "2025-10-07T12:44:44.274Z" }, + { url = 
"https://files.pythonhosted.org/packages/4d/1c/8e58eda6601f328b412cdeeaa88a9b6a10e591e2a73f313e8c0154d68385/maturin-1.9.6-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5263dda3f71feef2e4122baf5c4620e4b3710dbb7f2121f85a337182de214369", size = 15776470, upload-time = "2025-10-07T12:44:47.476Z" }, + { url = "https://files.pythonhosted.org/packages/6c/33/8c967cce6848cdd87a2e442c86120ac644b80c5ed4c32e3291bde6a17df8/maturin-1.9.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fe78262c2800c92f67d1ce3c0f6463f958a692cc67bfb572e5dbf5b4b696a8ba", size = 8226557, upload-time = "2025-10-07T12:44:49.844Z" }, + { url = "https://files.pythonhosted.org/packages/58/bd/3e2675cdc8b7270700ba30c663c852a35694441732a107ac30ebd6878bd8/maturin-1.9.6-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:7ab827c6e8c022eb2e1e7fb6deede54549c8460b20ccc2e9268cc6e8cde957a8", size = 8166544, upload-time = "2025-10-07T12:44:51.396Z" }, + { url = "https://files.pythonhosted.org/packages/58/1f/a2047ddf2230e700d5f8a13dd4b9af5ce806ad380c32e58105888205926e/maturin-1.9.6-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:0246202377c49449315305209f45c8ecef6e2d6bd27a04b5b6f1ab3e4ea47238", size = 8641010, upload-time = "2025-10-07T12:44:53.658Z" }, + { url = "https://files.pythonhosted.org/packages/be/1f/265d63c7aa6faf363d4a3f23396f51bc6b4d5c7680a4190ae68dba25dea2/maturin-1.9.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:f5bac167700fbb6f8c8ed1a97b494522554b4432d7578e11403b894b6a91d99f", size = 7965945, upload-time = "2025-10-07T12:44:55.248Z" }, + { url = "https://files.pythonhosted.org/packages/4c/ca/a8e61979ccfe080948bcc1bddd79356157aee687134df7fb013050cec783/maturin-1.9.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:7f53d3b1d8396d3fea3e1ee5fd37558bca5719090f3d194ba1c02b0b56327ae3", size = 7978820, upload-time = "2025-10-07T12:44:56.919Z" }, + { url = "https://files.pythonhosted.org/packages/bf/4a/81b412f8ad02a99801ef19ec059fba0822d1d28fb44cb6a92e722f05f278/maturin-1.9.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:7f506eb358386d94d6ec3208c003130cf4b69cab26034fc0cbbf8bf83afa4c2e", size = 10452064, upload-time = "2025-10-07T12:44:58.232Z" }, + { url = "https://files.pythonhosted.org/packages/5b/12/cc96c7a8cb51d8dcc9badd886c361caa1526fba7fa69d1e7892e613b71d4/maturin-1.9.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2d6984ab690af509f525dbd2b130714207c06ebb14a5814edbe1e42b17ae0de", size = 8852401, upload-time = "2025-10-07T12:44:59.8Z" }, + { url = "https://files.pythonhosted.org/packages/51/8e/653ac3c9f2c25cdd81aefb0a2d17ff140ca5a14504f5e3c7f94dcfe4dbb7/maturin-1.9.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5c2252b0956bb331460ac750c805ddf0d9b44442449fc1f16e3b66941689d0bc", size = 8425057, upload-time = "2025-10-07T12:45:01.711Z" }, + { url = "https://files.pythonhosted.org/packages/db/29/f13490328764ae9bfc1da55afc5b707cebe4fa75ad7a1573bfa82cfae0c6/maturin-1.9.6-py3-none-win32.whl", hash = "sha256:f2c58d29ebdd4346fd004e6be213d071fdd94a77a16aa91474a21a4f9dbf6309", size = 7165956, upload-time = "2025-10-07T12:45:03.766Z" }, + { url = "https://files.pythonhosted.org/packages/db/9f/dd51e5ac1fce47581b8efa03d77a03f928c0ef85b6e48a61dfa37b6b85a2/maturin-1.9.6-py3-none-win_amd64.whl", hash = 
"sha256:1b39a5d82572c240d20d9e8be024d722dfb311d330c5e28ddeb615211755941a", size = 8145722, upload-time = "2025-10-07T12:45:05.487Z" }, + { url = "https://files.pythonhosted.org/packages/65/f2/e97aaba6d0d78c5871771bf9dd71d4eb8dac15df9109cf452748d2207412/maturin-1.9.6-py3-none-win_arm64.whl", hash = "sha256:ac02a30083553d2a781c10cd6f5480119bf6692fd177e743267406cad2ad198c", size = 6857006, upload-time = "2025-10-07T12:45:06.813Z" }, +] + [[package]] name = "mdurl" version = "0.1.2" @@ -1759,8 +1780,11 @@ dependencies = [ { name = "pandas" }, { name = "polars" }, { name = "pyarrow" }, + { name = "pygraphviz" }, { name = "pyyaml" }, + { name = "starfix" }, { name = "typing-extensions" }, + { name = "uuid-utils" }, { name = "xxhash" }, ] @@ -1788,7 +1812,6 @@ dev = [ { name = "jsonschema" }, { name = "minio" }, { name = "pyarrow-stubs" }, - { name = "pygraphviz" }, { name = "pyiceberg" }, { name = "pyright" }, { name = "pytest" }, @@ -1814,10 +1837,13 @@ requires-dist = [ { name = "pandas", specifier = ">=2.2.3" }, { name = "polars", specifier = ">=1.31.0" }, { name = "pyarrow", specifier = ">=20.0.0" }, + { name = "pygraphviz", specifier = ">=1.14" }, { name = "pyyaml", specifier = ">=6.0.2" }, { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = "==2.48.0" }, { name = "redis", marker = "extra == 'redis'", specifier = ">=6.2.0" }, + { name = "starfix", specifier = ">=0.1.3" }, { name = "typing-extensions" }, + { name = "uuid-utils", specifier = ">=0.11.1" }, { name = "xxhash" }, ] provides-extras = ["redis", "ray", "all"] @@ -1832,7 +1858,6 @@ dev = [ { name = "jsonschema", specifier = ">=4.25.0" }, { name = "minio", specifier = ">=7.2.16" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, - { name = "pygraphviz", specifier = ">=1.14" }, { name = "pyiceberg", specifier = ">=0.9.1" }, { name = "pyright", specifier = ">=1.1.404" }, { name = "pytest", specifier = ">=8.3.5" }, @@ -2185,46 +2210,52 @@ wheels = [ [[package]] name = "pyarrow" -version = "20.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/ee/a7810cb9f3d6e9238e61d312076a9859bf3668fd21c69744de9532383912/pyarrow-20.0.0.tar.gz", hash = "sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1", size = 1125187, upload-time = "2025-04-27T12:34:23.264Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0", size = 30856035, upload-time = "2025-04-27T12:28:40.78Z" }, - { url = "https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb", size = 32309552, upload-time = "2025-04-27T12:28:47.051Z" }, - { url = "https://files.pythonhosted.org/packages/44/fb/dfb2dfdd3e488bb14f822d7335653092dde150cffc2da97de6e7500681f9/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f0fb1041267e9968c6d0d2ce3ff92e3928b243e2b6d11eeb84d9ac547308232", size = 41334704, upload-time = "2025-04-27T12:28:55.064Z" }, - { url = "https://files.pythonhosted.org/packages/58/0d/08a95878d38808051a953e887332d4a76bc06c6ee04351918ee1155407eb/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:b8ff87cc837601532cc8242d2f7e09b4e02404de1b797aee747dd4ba4bd6313f", size = 42399836, upload-time = "2025-04-27T12:29:02.13Z" }, - { url = "https://files.pythonhosted.org/packages/f3/cd/efa271234dfe38f0271561086eedcad7bc0f2ddd1efba423916ff0883684/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:7a3a5dcf54286e6141d5114522cf31dd67a9e7c9133d150799f30ee302a7a1ab", size = 40711789, upload-time = "2025-04-27T12:29:09.951Z" }, - { url = "https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62", size = 42301124, upload-time = "2025-04-27T12:29:17.187Z" }, - { url = "https://files.pythonhosted.org/packages/4f/92/692c562be4504c262089e86757a9048739fe1acb4024f92d39615e7bab3f/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6bb830757103a6cb300a04610e08d9636f0cd223d32f388418ea893a3e655f1c", size = 42916060, upload-time = "2025-04-27T12:29:24.253Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ec/9f5c7e7c828d8e0a3c7ef50ee62eca38a7de2fa6eb1b8fa43685c9414fef/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:96e37f0766ecb4514a899d9a3554fadda770fb57ddf42b63d80f14bc20aa7db3", size = 44547640, upload-time = "2025-04-27T12:29:32.782Z" }, - { url = "https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc", size = 25781491, upload-time = "2025-04-27T12:29:38.464Z" }, - { url = "https://files.pythonhosted.org/packages/a1/d6/0c10e0d54f6c13eb464ee9b67a68b8c71bcf2f67760ef5b6fbcddd2ab05f/pyarrow-20.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:75a51a5b0eef32727a247707d4755322cb970be7e935172b6a3a9f9ae98404ba", size = 30815067, upload-time = "2025-04-27T12:29:44.384Z" }, - { url = "https://files.pythonhosted.org/packages/7e/e2/04e9874abe4094a06fd8b0cbb0f1312d8dd7d707f144c2ec1e5e8f452ffa/pyarrow-20.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:211d5e84cecc640c7a3ab900f930aaff5cd2702177e0d562d426fb7c4f737781", size = 32297128, upload-time = "2025-04-27T12:29:52.038Z" }, - { url = "https://files.pythonhosted.org/packages/31/fd/c565e5dcc906a3b471a83273039cb75cb79aad4a2d4a12f76cc5ae90a4b8/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ba3cf4182828be7a896cbd232aa8dd6a31bd1f9e32776cc3796c012855e1199", size = 41334890, upload-time = "2025-04-27T12:29:59.452Z" }, - { url = "https://files.pythonhosted.org/packages/af/a9/3bdd799e2c9b20c1ea6dc6fa8e83f29480a97711cf806e823f808c2316ac/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c3a01f313ffe27ac4126f4c2e5ea0f36a5fc6ab51f8726cf41fee4b256680bd", size = 42421775, upload-time = "2025-04-27T12:30:06.875Z" }, - { url = "https://files.pythonhosted.org/packages/10/f7/da98ccd86354c332f593218101ae56568d5dcedb460e342000bd89c49cc1/pyarrow-20.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a2791f69ad72addd33510fec7bb14ee06c2a448e06b649e264c094c5b5f7ce28", size = 40687231, upload-time = "2025-04-27T12:30:13.954Z" }, - { url = "https://files.pythonhosted.org/packages/bb/1b/2168d6050e52ff1e6cefc61d600723870bf569cbf41d13db939c8cf97a16/pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = 
"sha256:4250e28a22302ce8692d3a0e8ec9d9dde54ec00d237cff4dfa9c1fbf79e472a8", size = 42295639, upload-time = "2025-04-27T12:30:21.949Z" }, - { url = "https://files.pythonhosted.org/packages/b2/66/2d976c0c7158fd25591c8ca55aee026e6d5745a021915a1835578707feb3/pyarrow-20.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:89e030dc58fc760e4010148e6ff164d2f44441490280ef1e97a542375e41058e", size = 42908549, upload-time = "2025-04-27T12:30:29.551Z" }, - { url = "https://files.pythonhosted.org/packages/31/a9/dfb999c2fc6911201dcbf348247f9cc382a8990f9ab45c12eabfd7243a38/pyarrow-20.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6102b4864d77102dbbb72965618e204e550135a940c2534711d5ffa787df2a5a", size = 44557216, upload-time = "2025-04-27T12:30:36.977Z" }, - { url = "https://files.pythonhosted.org/packages/a0/8e/9adee63dfa3911be2382fb4d92e4b2e7d82610f9d9f668493bebaa2af50f/pyarrow-20.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:96d6a0a37d9c98be08f5ed6a10831d88d52cac7b13f5287f1e0f625a0de8062b", size = 25660496, upload-time = "2025-04-27T12:30:42.809Z" }, - { url = "https://files.pythonhosted.org/packages/9b/aa/daa413b81446d20d4dad2944110dcf4cf4f4179ef7f685dd5a6d7570dc8e/pyarrow-20.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a15532e77b94c61efadde86d10957950392999503b3616b2ffcef7621a002893", size = 30798501, upload-time = "2025-04-27T12:30:48.351Z" }, - { url = "https://files.pythonhosted.org/packages/ff/75/2303d1caa410925de902d32ac215dc80a7ce7dd8dfe95358c165f2adf107/pyarrow-20.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:dd43f58037443af715f34f1322c782ec463a3c8a94a85fdb2d987ceb5658e061", size = 32277895, upload-time = "2025-04-27T12:30:55.238Z" }, - { url = "https://files.pythonhosted.org/packages/92/41/fe18c7c0b38b20811b73d1bdd54b1fccba0dab0e51d2048878042d84afa8/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa0d288143a8585806e3cc7c39566407aab646fb9ece164609dac1cfff45f6ae", size = 41327322, upload-time = "2025-04-27T12:31:05.587Z" }, - { url = "https://files.pythonhosted.org/packages/da/ab/7dbf3d11db67c72dbf36ae63dcbc9f30b866c153b3a22ef728523943eee6/pyarrow-20.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6953f0114f8d6f3d905d98e987d0924dabce59c3cda380bdfaa25a6201563b4", size = 42411441, upload-time = "2025-04-27T12:31:15.675Z" }, - { url = "https://files.pythonhosted.org/packages/90/c3/0c7da7b6dac863af75b64e2f827e4742161128c350bfe7955b426484e226/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:991f85b48a8a5e839b2128590ce07611fae48a904cae6cab1f089c5955b57eb5", size = 40677027, upload-time = "2025-04-27T12:31:24.631Z" }, - { url = "https://files.pythonhosted.org/packages/be/27/43a47fa0ff9053ab5203bb3faeec435d43c0d8bfa40179bfd076cdbd4e1c/pyarrow-20.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:97c8dc984ed09cb07d618d57d8d4b67a5100a30c3818c2fb0b04599f0da2de7b", size = 42281473, upload-time = "2025-04-27T12:31:31.311Z" }, - { url = "https://files.pythonhosted.org/packages/bc/0b/d56c63b078876da81bbb9ba695a596eabee9b085555ed12bf6eb3b7cab0e/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9b71daf534f4745818f96c214dbc1e6124d7daf059167330b610fc69b6f3d3e3", size = 42893897, upload-time = "2025-04-27T12:31:39.406Z" }, - { url = "https://files.pythonhosted.org/packages/92/ac/7d4bd020ba9145f354012838692d48300c1b8fe5634bfda886abcada67ed/pyarrow-20.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:e8b88758f9303fa5a83d6c90e176714b2fd3852e776fc2d7e42a22dd6c2fb368", size = 44543847, upload-time = "2025-04-27T12:31:45.997Z" }, - { url = "https://files.pythonhosted.org/packages/9d/07/290f4abf9ca702c5df7b47739c1b2c83588641ddfa2cc75e34a301d42e55/pyarrow-20.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:30b3051b7975801c1e1d387e17c588d8ab05ced9b1e14eec57915f79869b5031", size = 25653219, upload-time = "2025-04-27T12:31:54.11Z" }, - { url = "https://files.pythonhosted.org/packages/95/df/720bb17704b10bd69dde086e1400b8eefb8f58df3f8ac9cff6c425bf57f1/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:ca151afa4f9b7bc45bcc791eb9a89e90a9eb2772767d0b1e5389609c7d03db63", size = 30853957, upload-time = "2025-04-27T12:31:59.215Z" }, - { url = "https://files.pythonhosted.org/packages/d9/72/0d5f875efc31baef742ba55a00a25213a19ea64d7176e0fe001c5d8b6e9a/pyarrow-20.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:4680f01ecd86e0dd63e39eb5cd59ef9ff24a9d166db328679e36c108dc993d4c", size = 32247972, upload-time = "2025-04-27T12:32:05.369Z" }, - { url = "https://files.pythonhosted.org/packages/d5/bc/e48b4fa544d2eea72f7844180eb77f83f2030b84c8dad860f199f94307ed/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f4c8534e2ff059765647aa69b75d6543f9fef59e2cd4c6d18015192565d2b70", size = 41256434, upload-time = "2025-04-27T12:32:11.814Z" }, - { url = "https://files.pythonhosted.org/packages/c3/01/974043a29874aa2cf4f87fb07fd108828fc7362300265a2a64a94965e35b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e1f8a47f4b4ae4c69c4d702cfbdfe4d41e18e5c7ef6f1bb1c50918c1e81c57b", size = 42353648, upload-time = "2025-04-27T12:32:20.766Z" }, - { url = "https://files.pythonhosted.org/packages/68/95/cc0d3634cde9ca69b0e51cbe830d8915ea32dda2157560dda27ff3b3337b/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:a1f60dc14658efaa927f8214734f6a01a806d7690be4b3232ba526836d216122", size = 40619853, upload-time = "2025-04-27T12:32:28.1Z" }, - { url = "https://files.pythonhosted.org/packages/29/c2/3ad40e07e96a3e74e7ed7cc8285aadfa84eb848a798c98ec0ad009eb6bcc/pyarrow-20.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:204a846dca751428991346976b914d6d2a82ae5b8316a6ed99789ebf976551e6", size = 42241743, upload-time = "2025-04-27T12:32:35.792Z" }, - { url = "https://files.pythonhosted.org/packages/eb/cb/65fa110b483339add6a9bc7b6373614166b14e20375d4daa73483755f830/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f3b117b922af5e4c6b9a9115825726cac7d8b1421c37c2b5e24fbacc8930612c", size = 42839441, upload-time = "2025-04-27T12:32:46.64Z" }, - { url = "https://files.pythonhosted.org/packages/98/7b/f30b1954589243207d7a0fbc9997401044bf9a033eec78f6cb50da3f304a/pyarrow-20.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e724a3fd23ae5b9c010e7be857f4405ed5e679db5c93e66204db1a69f733936a", size = 44503279, upload-time = "2025-04-27T12:32:56.503Z" }, - { url = "https://files.pythonhosted.org/packages/37/40/ad395740cd641869a13bcf60851296c89624662575621968dcfafabaa7f6/pyarrow-20.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9", size = 25944982, upload-time = "2025-04-27T12:33:04.72Z" }, +version = "22.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/53/04a7fdc63e6056116c9ddc8b43bc28c12cdd181b85cbeadb79278475f3ae/pyarrow-22.0.0.tar.gz", hash = 
"sha256:3d600dc583260d845c7d8a6db540339dd883081925da2bd1c5cb808f720b3cd9", size = 1151151, upload-time = "2025-10-24T12:30:00.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/b7/18f611a8cdc43417f9394a3ccd3eace2f32183c08b9eddc3d17681819f37/pyarrow-22.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:3e294c5eadfb93d78b0763e859a0c16d4051fc1c5231ae8956d61cb0b5666f5a", size = 34272022, upload-time = "2025-10-24T10:04:28.973Z" }, + { url = "https://files.pythonhosted.org/packages/26/5c/f259e2526c67eb4b9e511741b19870a02363a47a35edbebc55c3178db22d/pyarrow-22.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:69763ab2445f632d90b504a815a2a033f74332997052b721002298ed6de40f2e", size = 35995834, upload-time = "2025-10-24T10:04:35.467Z" }, + { url = "https://files.pythonhosted.org/packages/50/8d/281f0f9b9376d4b7f146913b26fac0aa2829cd1ee7e997f53a27411bbb92/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:b41f37cabfe2463232684de44bad753d6be08a7a072f6a83447eeaf0e4d2a215", size = 45030348, upload-time = "2025-10-24T10:04:43.366Z" }, + { url = "https://files.pythonhosted.org/packages/f5/e5/53c0a1c428f0976bf22f513d79c73000926cb00b9c138d8e02daf2102e18/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:35ad0f0378c9359b3f297299c3309778bb03b8612f987399a0333a560b43862d", size = 47699480, upload-time = "2025-10-24T10:04:51.486Z" }, + { url = "https://files.pythonhosted.org/packages/95/e1/9dbe4c465c3365959d183e6345d0a8d1dc5b02ca3f8db4760b3bc834cf25/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8382ad21458075c2e66a82a29d650f963ce51c7708c7c0ff313a8c206c4fd5e8", size = 48011148, upload-time = "2025-10-24T10:04:59.585Z" }, + { url = "https://files.pythonhosted.org/packages/c5/b4/7caf5d21930061444c3cf4fa7535c82faf5263e22ce43af7c2759ceb5b8b/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1a812a5b727bc09c3d7ea072c4eebf657c2f7066155506ba31ebf4792f88f016", size = 50276964, upload-time = "2025-10-24T10:05:08.175Z" }, + { url = "https://files.pythonhosted.org/packages/ae/f3/cec89bd99fa3abf826f14d4e53d3d11340ce6f6af4d14bdcd54cd83b6576/pyarrow-22.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:ec5d40dd494882704fb876c16fa7261a69791e784ae34e6b5992e977bd2e238c", size = 28106517, upload-time = "2025-10-24T10:05:14.314Z" }, + { url = "https://files.pythonhosted.org/packages/af/63/ba23862d69652f85b615ca14ad14f3bcfc5bf1b99ef3f0cd04ff93fdad5a/pyarrow-22.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:bea79263d55c24a32b0d79c00a1c58bb2ee5f0757ed95656b01c0fb310c5af3d", size = 34211578, upload-time = "2025-10-24T10:05:21.583Z" }, + { url = "https://files.pythonhosted.org/packages/b1/d0/f9ad86fe809efd2bcc8be32032fa72e8b0d112b01ae56a053006376c5930/pyarrow-22.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:12fe549c9b10ac98c91cf791d2945e878875d95508e1a5d14091a7aaa66d9cf8", size = 35989906, upload-time = "2025-10-24T10:05:29.485Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a8/f910afcb14630e64d673f15904ec27dd31f1e009b77033c365c84e8c1e1d/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:334f900ff08ce0423407af97e6c26ad5d4e3b0763645559ece6fbf3747d6a8f5", size = 45021677, upload-time = "2025-10-24T10:05:38.274Z" }, + { url = "https://files.pythonhosted.org/packages/13/95/aec81f781c75cd10554dc17a25849c720d54feafb6f7847690478dcf5ef8/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c6c791b09c57ed76a18b03f2631753a4960eefbbca80f846da8baefc6491fcfe", size = 47726315, 
upload-time = "2025-10-24T10:05:47.314Z" }, + { url = "https://files.pythonhosted.org/packages/bb/d4/74ac9f7a54cfde12ee42734ea25d5a3c9a45db78f9def949307a92720d37/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c3200cb41cdbc65156e5f8c908d739b0dfed57e890329413da2748d1a2cd1a4e", size = 47990906, upload-time = "2025-10-24T10:05:58.254Z" }, + { url = "https://files.pythonhosted.org/packages/2e/71/fedf2499bf7a95062eafc989ace56572f3343432570e1c54e6599d5b88da/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ac93252226cf288753d8b46280f4edf3433bf9508b6977f8dd8526b521a1bbb9", size = 50306783, upload-time = "2025-10-24T10:06:08.08Z" }, + { url = "https://files.pythonhosted.org/packages/68/ed/b202abd5a5b78f519722f3d29063dda03c114711093c1995a33b8e2e0f4b/pyarrow-22.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:44729980b6c50a5f2bfcc2668d36c569ce17f8b17bccaf470c4313dcbbf13c9d", size = 27972883, upload-time = "2025-10-24T10:06:14.204Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d6/d0fac16a2963002fc22c8fa75180a838737203d558f0ed3b564c4a54eef5/pyarrow-22.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e6e95176209257803a8b3d0394f21604e796dadb643d2f7ca21b66c9c0b30c9a", size = 34204629, upload-time = "2025-10-24T10:06:20.274Z" }, + { url = "https://files.pythonhosted.org/packages/c6/9c/1d6357347fbae062ad3f17082f9ebc29cc733321e892c0d2085f42a2212b/pyarrow-22.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:001ea83a58024818826a9e3f89bf9310a114f7e26dfe404a4c32686f97bd7901", size = 35985783, upload-time = "2025-10-24T10:06:27.301Z" }, + { url = "https://files.pythonhosted.org/packages/ff/c0/782344c2ce58afbea010150df07e3a2f5fdad299cd631697ae7bd3bac6e3/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ce20fe000754f477c8a9125543f1936ea5b8867c5406757c224d745ed033e691", size = 45020999, upload-time = "2025-10-24T10:06:35.387Z" }, + { url = "https://files.pythonhosted.org/packages/1b/8b/5362443737a5307a7b67c1017c42cd104213189b4970bf607e05faf9c525/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e0a15757fccb38c410947df156f9749ae4a3c89b2393741a50521f39a8cf202a", size = 47724601, upload-time = "2025-10-24T10:06:43.551Z" }, + { url = "https://files.pythonhosted.org/packages/69/4d/76e567a4fc2e190ee6072967cb4672b7d9249ac59ae65af2d7e3047afa3b/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cedb9dd9358e4ea1d9bce3665ce0797f6adf97ff142c8e25b46ba9cdd508e9b6", size = 48001050, upload-time = "2025-10-24T10:06:52.284Z" }, + { url = "https://files.pythonhosted.org/packages/01/5e/5653f0535d2a1aef8223cee9d92944cb6bccfee5cf1cd3f462d7cb022790/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:252be4a05f9d9185bb8c18e83764ebcfea7185076c07a7a662253af3a8c07941", size = 50307877, upload-time = "2025-10-24T10:07:02.405Z" }, + { url = "https://files.pythonhosted.org/packages/2d/f8/1d0bd75bf9328a3b826e24a16e5517cd7f9fbf8d34a3184a4566ef5a7f29/pyarrow-22.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:a4893d31e5ef780b6edcaf63122df0f8d321088bb0dee4c8c06eccb1ca28d145", size = 27977099, upload-time = "2025-10-24T10:08:07.259Z" }, + { url = "https://files.pythonhosted.org/packages/90/81/db56870c997805bf2b0f6eeeb2d68458bf4654652dccdcf1bf7a42d80903/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f7fe3dbe871294ba70d789be16b6e7e52b418311e166e0e3cba9522f0f437fb1", size = 34336685, upload-time = "2025-10-24T10:07:11.47Z" }, + { url = 
"https://files.pythonhosted.org/packages/1c/98/0727947f199aba8a120f47dfc229eeb05df15bcd7a6f1b669e9f882afc58/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ba95112d15fd4f1105fb2402c4eab9068f0554435e9b7085924bcfaac2cc306f", size = 36032158, upload-time = "2025-10-24T10:07:18.626Z" }, + { url = "https://files.pythonhosted.org/packages/96/b4/9babdef9c01720a0785945c7cf550e4acd0ebcd7bdd2e6f0aa7981fa85e2/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:c064e28361c05d72eed8e744c9605cbd6d2bb7481a511c74071fd9b24bc65d7d", size = 44892060, upload-time = "2025-10-24T10:07:26.002Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ca/2f8804edd6279f78a37062d813de3f16f29183874447ef6d1aadbb4efa0f/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6f9762274496c244d951c819348afbcf212714902742225f649cf02823a6a10f", size = 47504395, upload-time = "2025-10-24T10:07:34.09Z" }, + { url = "https://files.pythonhosted.org/packages/b9/f0/77aa5198fd3943682b2e4faaf179a674f0edea0d55d326d83cb2277d9363/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a9d9ffdc2ab696f6b15b4d1f7cec6658e1d788124418cb30030afbae31c64746", size = 48066216, upload-time = "2025-10-24T10:07:43.528Z" }, + { url = "https://files.pythonhosted.org/packages/79/87/a1937b6e78b2aff18b706d738c9e46ade5bfcf11b294e39c87706a0089ac/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ec1a15968a9d80da01e1d30349b2b0d7cc91e96588ee324ce1b5228175043e95", size = 50288552, upload-time = "2025-10-24T10:07:53.519Z" }, + { url = "https://files.pythonhosted.org/packages/60/ae/b5a5811e11f25788ccfdaa8f26b6791c9807119dffcf80514505527c384c/pyarrow-22.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:bba208d9c7decf9961998edf5c65e3ea4355d5818dd6cd0f6809bec1afb951cc", size = 28262504, upload-time = "2025-10-24T10:08:00.932Z" }, + { url = "https://files.pythonhosted.org/packages/bd/b0/0fa4d28a8edb42b0a7144edd20befd04173ac79819547216f8a9f36f9e50/pyarrow-22.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:9bddc2cade6561f6820d4cd73f99a0243532ad506bc510a75a5a65a522b2d74d", size = 34224062, upload-time = "2025-10-24T10:08:14.101Z" }, + { url = "https://files.pythonhosted.org/packages/0f/a8/7a719076b3c1be0acef56a07220c586f25cd24de0e3f3102b438d18ae5df/pyarrow-22.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e70ff90c64419709d38c8932ea9fe1cc98415c4f87ea8da81719e43f02534bc9", size = 35990057, upload-time = "2025-10-24T10:08:21.842Z" }, + { url = "https://files.pythonhosted.org/packages/89/3c/359ed54c93b47fb6fe30ed16cdf50e3f0e8b9ccfb11b86218c3619ae50a8/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:92843c305330aa94a36e706c16209cd4df274693e777ca47112617db7d0ef3d7", size = 45068002, upload-time = "2025-10-24T10:08:29.034Z" }, + { url = "https://files.pythonhosted.org/packages/55/fc/4945896cc8638536ee787a3bd6ce7cec8ec9acf452d78ec39ab328efa0a1/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:6dda1ddac033d27421c20d7a7943eec60be44e0db4e079f33cc5af3b8280ccde", size = 47737765, upload-time = "2025-10-24T10:08:38.559Z" }, + { url = "https://files.pythonhosted.org/packages/cd/5e/7cb7edeb2abfaa1f79b5d5eb89432356155c8426f75d3753cbcb9592c0fd/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:84378110dd9a6c06323b41b56e129c504d157d1a983ce8f5443761eb5256bafc", size = 48048139, upload-time = "2025-10-24T10:08:46.784Z" }, + { url = 
"https://files.pythonhosted.org/packages/88/c6/546baa7c48185f5e9d6e59277c4b19f30f48c94d9dd938c2a80d4d6b067c/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:854794239111d2b88b40b6ef92aa478024d1e5074f364033e73e21e3f76b25e0", size = 50314244, upload-time = "2025-10-24T10:08:55.771Z" }, + { url = "https://files.pythonhosted.org/packages/3c/79/755ff2d145aafec8d347bf18f95e4e81c00127f06d080135dfc86aea417c/pyarrow-22.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:b883fe6fd85adad7932b3271c38ac289c65b7337c2c132e9569f9d3940620730", size = 28757501, upload-time = "2025-10-24T10:09:59.891Z" }, + { url = "https://files.pythonhosted.org/packages/0e/d2/237d75ac28ced3147912954e3c1a174df43a95f4f88e467809118a8165e0/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:7a820d8ae11facf32585507c11f04e3f38343c1e784c9b5a8b1da5c930547fe2", size = 34355506, upload-time = "2025-10-24T10:09:02.953Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/733dfffe6d3069740f98e57ff81007809067d68626c5faef293434d11bd6/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:c6ec3675d98915bf1ec8b3c7986422682f7232ea76cad276f4c8abd5b7319b70", size = 36047312, upload-time = "2025-10-24T10:09:10.334Z" }, + { url = "https://files.pythonhosted.org/packages/7c/2b/29d6e3782dc1f299727462c1543af357a0f2c1d3c160ce199950d9ca51eb/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:3e739edd001b04f654b166204fc7a9de896cf6007eaff33409ee9e50ceaff754", size = 45081609, upload-time = "2025-10-24T10:09:18.61Z" }, + { url = "https://files.pythonhosted.org/packages/8d/42/aa9355ecc05997915af1b7b947a7f66c02dcaa927f3203b87871c114ba10/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7388ac685cab5b279a41dfe0a6ccd99e4dbf322edfb63e02fc0443bf24134e91", size = 47703663, upload-time = "2025-10-24T10:09:27.369Z" }, + { url = "https://files.pythonhosted.org/packages/ee/62/45abedde480168e83a1de005b7b7043fd553321c1e8c5a9a114425f64842/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f633074f36dbc33d5c05b5dc75371e5660f1dbf9c8b1d95669def05e5425989c", size = 48066543, upload-time = "2025-10-24T10:09:34.908Z" }, + { url = "https://files.pythonhosted.org/packages/84/e9/7878940a5b072e4f3bf998770acafeae13b267f9893af5f6d4ab3904b67e/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4c19236ae2402a8663a2c8f21f1870a03cc57f0bef7e4b6eb3238cc82944de80", size = 50288838, upload-time = "2025-10-24T10:09:44.394Z" }, + { url = "https://files.pythonhosted.org/packages/7b/03/f335d6c52b4a4761bcc83499789a1e2e16d9d201a58c327a9b5cc9a41bd9/pyarrow-22.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0c34fe18094686194f204a3b1787a27456897d8a2d62caf84b61e8dfbc0252ae", size = 29185594, upload-time = "2025-10-24T10:09:53.111Z" }, ] [[package]] @@ -3033,6 +3064,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, ] +[[package]] +name = "starfix" +version = "0.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ipykernel" }, + { name = "maturin" }, + { name = "pyarrow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/73/942a97c83a54ec1f641af1c2c8ff15c8ad5e1955d66f56c5437ef6e5c18e/starfix-0.1.3.tar.gz", hash = 
"sha256:4ac9090e24374dd3d4af466d04bdf6a9fe180ac8fd902b94b29f263d58803b5e", size = 18254, upload-time = "2025-10-29T19:53:23.657Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/be/98ca0482cdb4fa25a11a4dbc59c4d2a643bd8210c6c3305b2d58b5e0460c/starfix-0.1.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ef86702f0d0c8cd37b00cf63aeb6a555832eb24d7853cbe84316473ac38992d8", size = 469719, upload-time = "2025-10-29T19:53:22.473Z" }, + { url = "https://files.pythonhosted.org/packages/94/bf/208c8307d9f005ee9e6709e15bc6fff40c77293c31a8539324dddde8e783/starfix-0.1.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b0c713211ea8b293dbb4f172ca648a7b78481603c47d729c87126c867ed5b5a5", size = 598464, upload-time = "2025-10-29T19:53:21.126Z" }, +] + [[package]] name = "strictyaml" version = "1.7.3" @@ -3172,6 +3218,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680, upload-time = "2025-04-10T15:23:37.377Z" }, ] +[[package]] +name = "uuid-utils" +version = "0.11.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e2/ef/b6c1fd4fee3b2854bf9d602530ab8b6624882e2691c15a9c4d22ea8c03eb/uuid_utils-0.11.1.tar.gz", hash = "sha256:7ef455547c2ccb712840b106b5ab006383a9bfe4125ba1c5ab92e47bcbf79b46", size = 19933, upload-time = "2025-10-02T13:32:09.526Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/f5/254d7ce4b3aa4a1a3a4f279e0cc74eec8b4d3a61641d8ffc6e983907f2ca/uuid_utils-0.11.1-cp39-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:4bc8cf73c375b9ea11baf70caacc2c4bf7ce9bfd804623aa0541e5656f3dbeaf", size = 581019, upload-time = "2025-10-02T13:31:32.239Z" }, + { url = "https://files.pythonhosted.org/packages/68/e6/f7d14c4e1988d8beb3ac9bd773f370376c704925bdfb07380f5476bb2986/uuid_utils-0.11.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:0d2cb3bcc6f5862d08a0ee868b18233bc63ba9ea0e85ea9f3f8e703983558eba", size = 294377, upload-time = "2025-10-02T13:31:34.01Z" }, + { url = "https://files.pythonhosted.org/packages/8e/40/847a9a0258e7a2a14b015afdaa06ee4754a2680db7b74bac159d594eeb18/uuid_utils-0.11.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:463400604f623969f198aba9133ebfd717636f5e34257340302b1c3ff685dc0f", size = 328070, upload-time = "2025-10-02T13:31:35.619Z" }, + { url = "https://files.pythonhosted.org/packages/44/0c/c5d342d31860c9b4f481ef31a4056825961f9b462d216555e76dcee580ea/uuid_utils-0.11.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aef66b935342b268c6ffc1796267a1d9e73135740a10fe7e4098e1891cbcc476", size = 333610, upload-time = "2025-10-02T13:31:37.058Z" }, + { url = "https://files.pythonhosted.org/packages/e1/4b/52edc023ffcb9ab9a4042a58974a79c39ba7a565e683f1fd9814b504cf13/uuid_utils-0.11.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fd65c41b81b762278997de0d027161f27f9cc4058fa57bbc0a1aaa63a63d6d1a", size = 475669, upload-time = "2025-10-02T13:31:38.38Z" }, + { url = "https://files.pythonhosted.org/packages/59/81/ee55ee63264531bb1c97b5b6033ad6ec81b5cd77f89174e9aef3af3d8889/uuid_utils-0.11.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccfac9d5d7522d61accabb8c68448ead6407933415e67e62123ed6ed11f86510", size = 331946, upload-time = 
"2025-10-02T13:31:39.66Z" }, + { url = "https://files.pythonhosted.org/packages/cf/07/5d4be27af0e9648afa512f0d11bb6d96cb841dd6d29b57baa3fbf55fd62e/uuid_utils-0.11.1-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:003f48f05c01692d0c1f7e413d194e7299a1a364e0047a4eb904d3478b84eca1", size = 352920, upload-time = "2025-10-02T13:31:40.94Z" }, + { url = "https://files.pythonhosted.org/packages/5b/48/a69dddd9727512b0583b87bfff97d82a8813b28fb534a183c9e37033cfef/uuid_utils-0.11.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:a5c936042120bdc30d62f539165beaa4a6ba7e817a89e5409a6f06dc62c677a9", size = 509413, upload-time = "2025-10-02T13:31:42.547Z" }, + { url = "https://files.pythonhosted.org/packages/66/0d/1b529a3870c2354dd838d5f133a1cba75220242b0061f04a904ca245a131/uuid_utils-0.11.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:2e16dcdbdf4cd34ffb31ead6236960adb50e6c962c9f4554a6ecfdfa044c6259", size = 529454, upload-time = "2025-10-02T13:31:44.338Z" }, + { url = "https://files.pythonhosted.org/packages/bd/f2/04a3f77c85585aac09d546edaf871a4012052fb8ace6dbddd153b4d50f02/uuid_utils-0.11.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f8b21fed11b23134502153d652c77c3a37fa841a9aa15a4e6186d440a22f1a0e", size = 498084, upload-time = "2025-10-02T13:31:45.601Z" }, + { url = "https://files.pythonhosted.org/packages/89/08/538b380b4c4b220f3222c970930fe459cc37f1dfc6c8dc912568d027f17d/uuid_utils-0.11.1-cp39-abi3-win32.whl", hash = "sha256:72abab5ab27c1b914e3f3f40f910532ae242df1b5f0ae43f1df2ef2f610b2a8c", size = 174314, upload-time = "2025-10-02T13:31:47.269Z" }, + { url = "https://files.pythonhosted.org/packages/00/66/971ec830094ac1c7d46381678f7138c1805015399805e7dd7769c893c9c8/uuid_utils-0.11.1-cp39-abi3-win_amd64.whl", hash = "sha256:5ed9962f8993ef2fd418205f92830c29344102f86871d99b57cef053abf227d9", size = 179214, upload-time = "2025-10-02T13:31:48.344Z" }, +] + [[package]] name = "virtualenv" version = "20.33.0" From 3e4d3271280843a4c5789f2f9aec99b6d0425b00 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 11 Nov 2025 20:57:25 +0000 Subject: [PATCH 05/11] feat: make datagram carry record_id --- src/orcapod/core/datagrams/arrow_datagram.py | 4 +- .../core/datagrams/arrow_tag_packet.py | 8 + src/orcapod/core/datagrams/base.py | 28 +- src/orcapod/core/datagrams/dict_datagram.py | 15 +- src/orcapod/core/datagrams/dict_tag_packet.py | 8 + src/orcapod/core/function_pod.py | 362 +++++++++++++----- src/orcapod/core/packet_function.py | 183 +++++---- .../protocols/core_protocols/datagrams.py | 3 +- .../core_protocols/packet_function.py | 3 +- src/orcapod/system_constants.py | 10 + 10 files changed, 433 insertions(+), 191 deletions(-) diff --git a/src/orcapod/core/datagrams/arrow_datagram.py b/src/orcapod/core/datagrams/arrow_datagram.py index 2399e56..428c212 100644 --- a/src/orcapod/core/datagrams/arrow_datagram.py +++ b/src/orcapod/core/datagrams/arrow_datagram.py @@ -57,6 +57,7 @@ def __init__( table: "pa.Table", meta_info: Mapping[str, DataValue] | None = None, data_context: str | contexts.DataContext | None = None, + record_id: str | None = None, **kwargs, ) -> None: """ @@ -76,7 +77,6 @@ def __init__( The input table is automatically split into data, meta, and context components based on column naming conventions. 
""" - super().__init__(**kwargs) # Validate table has exactly one row for datagram if len(table) != 1: @@ -100,7 +100,7 @@ def __init__( data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] # Initialize base class with data context - super().__init__(data_context=data_context, **kwargs) + super().__init__(data_context=data_context, record_id=record_id, **kwargs) meta_columns = [ col for col in table.column_names if col.startswith(constants.META_PREFIX) diff --git a/src/orcapod/core/datagrams/arrow_tag_packet.py b/src/orcapod/core/datagrams/arrow_tag_packet.py index 9dc0c31..e64978d 100644 --- a/src/orcapod/core/datagrams/arrow_tag_packet.py +++ b/src/orcapod/core/datagrams/arrow_tag_packet.py @@ -40,6 +40,8 @@ def __init__( table: "pa.Table", system_tags: Mapping[str, DataValue] | None = None, data_context: str | contexts.DataContext | None = None, + record_id: str | None = None, + **kwargs, ) -> None: if len(table) != 1: raise ValueError( @@ -49,6 +51,8 @@ def __init__( super().__init__( table=table, data_context=data_context, + record_id=record_id, + **kwargs, ) extracted_system_tag_columns = [ c @@ -237,6 +241,8 @@ def __init__( meta_info: Mapping[str, DataValue] | None = None, source_info: Mapping[str, str | None] | None = None, data_context: str | contexts.DataContext | None = None, + record_id: str | None = None, + **kwargs, ) -> None: if len(table) != 1: raise ValueError( @@ -269,6 +275,8 @@ def __init__( data_table, meta_info=meta_info, data_context=data_context, + record_id=record_id, + **kwargs, ) self._source_info_table = prefixed_tables[constants.SOURCE_PREFIX] diff --git a/src/orcapod/core/datagrams/base.py b/src/orcapod/core/datagrams/base.py index 9495fac..4a35732 100644 --- a/src/orcapod/core/datagrams/base.py +++ b/src/orcapod/core/datagrams/base.py @@ -121,21 +121,18 @@ class BaseDatagram(ContentIdentifiableBase): is interpreted and used is left to concrete implementations. """ - def __init__(self, **kwargs): + def __init__(self, record_id: str | None = None, **kwargs): super().__init__(**kwargs) - self._uuid = None + self._record_id = record_id @property - def uuid(self) -> UUID: + def record_id(self) -> str: """ - Return the UUID of this datagram. - - Returns: - UUID: The unique identifier for this instance of datagram. + Returns record ID """ - if self._uuid is None: - self._uuid = UUID(bytes=uuid7().bytes) - return self._uuid + if self._record_id is None: + self._record_id = str(uuid7()) + return self._record_id # TODO: revisit handling of identity structure for datagrams def identity_structure(self) -> Any: @@ -286,12 +283,13 @@ def with_context_key(self, new_context_key: str) -> Self: return new_datagram # 8. 
Utility Operations - def copy(self, include_cache: bool = True) -> Self: + def copy(self, include_cache: bool = True, preserve_record_id: bool = True) -> Self: """Create a shallow copy of the datagram.""" new_datagram = object.__new__(self.__class__) new_datagram._data_context = self._data_context - if include_cache: - # preserve uuid if cache is preserved - # TODO: revisit this logic - new_datagram._uuid = self._uuid + + if preserve_record_id: + new_datagram._record_id = self._record_id + else: + new_datagram._record_id = None return new_datagram diff --git a/src/orcapod/core/datagrams/dict_datagram.py b/src/orcapod/core/datagrams/dict_datagram.py index 9207708..4006bf0 100644 --- a/src/orcapod/core/datagrams/dict_datagram.py +++ b/src/orcapod/core/datagrams/dict_datagram.py @@ -61,6 +61,8 @@ def __init__( python_schema: PythonSchemaLike | None = None, meta_info: Mapping[str, DataValue] | None = None, data_context: str | contexts.DataContext | None = None, + record_id: str | None = None, + **kwargs, ) -> None: """ Initialize DictDatagram from dictionary data. @@ -97,7 +99,7 @@ def __init__( # Initialize base class with data context final_context = data_context or cast(str, extracted_context) - super().__init__(data_context=final_context) + super().__init__(data_context=final_context, record_id=record_id, **kwargs) # Store data and meta components separately (immutable) self._data = dict(data_columns) @@ -534,11 +536,16 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: full_data = dict(self._data) # User data full_data.update(new_meta_data) # Meta data - return self.__class__( + new_datagram = self.__class__( data=full_data, data_context=self._data_context, ) + # TODO: use copy instead + new_datagram._record_id = self._record_id + + return new_datagram + def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: """ Create a new DictDatagram with specified meta columns dropped. @@ -764,7 +771,7 @@ def with_columns( return new_datagram # 8. Utility Operations - def copy(self, include_cache: bool = True) -> Self: + def copy(self, include_cache: bool = True, preserve_record_id:bool=True) -> Self: """ Create a shallow copy of the datagram. @@ -775,7 +782,7 @@ def copy(self, include_cache: bool = True) -> Self: Returns: New DictDatagram instance with copied data and caches. """ - new_datagram = super().copy() + new_datagram = super().copy(include_cache=include_cache, preserve_record_id=preserve_record_id) new_datagram._data = self._data.copy() new_datagram._meta_data = self._meta_data.copy() new_datagram._data_python_schema = self._data_python_schema.copy() diff --git a/src/orcapod/core/datagrams/dict_tag_packet.py b/src/orcapod/core/datagrams/dict_tag_packet.py index cdc7854..a53d9bf 100644 --- a/src/orcapod/core/datagrams/dict_tag_packet.py +++ b/src/orcapod/core/datagrams/dict_tag_packet.py @@ -34,6 +34,8 @@ def __init__( meta_info: Mapping[str, DataValue] | None = None, python_schema: dict[str, type] | None = None, data_context: str | contexts.DataContext | None = None, + record_id: str | None = None, + **kwargs, ) -> None: """ Initialize the tag with data. 
@@ -56,6 +58,8 @@ def __init__( python_schema=python_schema, meta_info=meta_info, data_context=data_context, + record_id=record_id, + **kwargs, ) self._system_tags = {**extracted_system_tags, **(system_tags or {})} @@ -246,6 +250,8 @@ def __init__( source_info: Mapping[str, str | None] | None = None, python_schema: PythonSchemaLike | None = None, data_context: str | contexts.DataContext | None = None, + record_id: str | None = None, + **kwargs, ) -> None: # normalize the data content and remove any source info keys data_only = { @@ -262,6 +268,8 @@ def __init__( python_schema=python_schema, meta_info=meta_info, data_context=data_context, + record_id=record_id, + **kwargs, ) self._source_info = {**contained_source_info, **(source_info or {})} diff --git a/src/orcapod/core/function_pod.py b/src/orcapod/core/function_pod.py index 9da0829..7654353 100644 --- a/src/orcapod/core/function_pod.py +++ b/src/orcapod/core/function_pod.py @@ -1,11 +1,12 @@ import logging from collections.abc import Callable, Collection, Iterator from typing import TYPE_CHECKING, Any, Protocol, cast - +from orcapod.protocols.database_protocols import ArrowDatabase +from orcapod.system_constants import constants from orcapod import contexts from orcapod.core.base import OrcapodBase from orcapod.core.operators import Join -from orcapod.core.packet_function import PythonPacketFunction +from orcapod.core.packet_function import PythonPacketFunction, CachedPacketFunction from orcapod.core.streams.base import StreamBase from orcapod.core.tracker import DEFAULT_TRACKER_MANAGER from orcapod.protocols.core_protocols import ( @@ -26,8 +27,10 @@ if TYPE_CHECKING: import pyarrow as pa + import polars as pl else: pa = LazyModule("pyarrow") + pl = LazyModule("polars") class FunctionPod(OrcapodBase): @@ -39,13 +42,17 @@ def __init__( ) -> None: super().__init__(**kwargs) self.tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER - self.packet_function = packet_function + self._packet_function = packet_function self._output_schema_hash = self.data_context.object_hasher.hash_object( self.packet_function.output_packet_schema ).to_string() + @property + def packet_function(self) -> PacketFunction: + return self._packet_function + def identity_structure(self) -> Any: - return self.packet_function + return self.packet_function.identity_structure() @property def uri(self) -> tuple[str, ...]: @@ -89,6 +96,19 @@ def validate_inputs(self, *streams: Stream) -> None: f"Incoming packet data type {incoming_packet_types} from {input_stream} is not compatible with expected input typespec {expected_packet_schema}" ) + def process_packet(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: + """ + Process a single packet using the pod's packet function. 
+ + Args: + tag: The tag associated with the packet + packet: The input packet to process + + Returns: + Packet | None: The processed output packet, or None if filtered out + """ + return tag, self.packet_function.call(packet) + def process( self, *streams: Stream, label: str | None = None ) -> "FunctionPodStream": @@ -124,13 +144,15 @@ def process( ) return output_stream - def __call__(self, *streams: Stream, **kwargs) -> "FunctionPodStream": + def __call__( + self, *streams: Stream, label: str | None = None + ) -> "FunctionPodStream": """ Convenience method to invoke the pod process on a collection of streams, """ logger.debug(f"Invoking pod {self} on streams through __call__: {streams}") # perform input stream validation - return self.process(*streams, **kwargs) + return self.process(*streams, label=label) def argument_symmetry(self, streams: Collection[Stream]) -> ArgumentGroup: return self.multi_stream_handler().argument_symmetry(streams) @@ -172,12 +194,6 @@ def __init__( self._cached_output_table: pa.Table | None = None self._cached_content_hash_column: pa.Array | None = None - def identity_structure(self): - return ( - self._function_pod, - self._input_stream, - ) - @property def source(self) -> Pod: return self._function_pod @@ -223,7 +239,7 @@ def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: yield tag, packet else: # Process packet - output_packet = self._function_pod.packet_function.call(packet) + tag, output_packet = self._function_pod.process_packet(tag, packet) self._cached_output_packets[i] = (tag, output_packet) if output_packet is not None: # Update shared cache for future iterators (optimization) @@ -264,6 +280,8 @@ def as_table( all_tags_as_tables: pa.Table = pa.Table.from_pylist( all_tags, schema=tag_schema ) + # drop context key column from tags table + all_tags_as_tables = all_tags_as_tables.drop([constants.CONTEXT_KEY]) all_packets_as_tables: pa.Table = pa.Table.from_pylist( struct_packets, schema=packet_schema ) @@ -275,58 +293,58 @@ def as_table( "_cached_output_table should not be None here." ) - return self._cached_output_table - - # drop_columns = [] - # if not include_system_tags: - # # TODO: get system tags more effiicently - # drop_columns.extend( - # [ - # c - # for c in self._cached_output_table.column_names - # if c.startswith(constants.SYSTEM_TAG_PREFIX) - # ] - # ) - # if not include_source: - # drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) - # if not include_data_context: - # drop_columns.append(constants.CONTEXT_KEY) - - # output_table = self._cached_output_table.drop(drop_columns) - - # # lazily prepare content hash column if requested - # if include_content_hash: - # if self._cached_content_hash_column is None: - # content_hashes = [] - # # TODO: verify that order will be preserved - # for tag, packet in self.iter_packets(): - # content_hashes.append(packet.content_hash().to_string()) - # self._cached_content_hash_column = pa.array( - # content_hashes, type=pa.large_string() - # ) - # assert self._cached_content_hash_column is not None, ( - # "_cached_content_hash_column should not be None here." 
- # ) - # hash_column_name = ( - # "_content_hash" - # if include_content_hash is True - # else include_content_hash - # ) - # output_table = output_table.append_column( - # hash_column_name, self._cached_content_hash_column - # ) - - # if sort_by_tags: - # # TODO: reimplement using polars natively - # output_table = ( - # pl.DataFrame(output_table) - # .sort(by=self.keys()[0], descending=False) - # .to_arrow() - # ) - # # output_table = output_table.sort_by( - # # [(column, "ascending") for column in self.keys()[0]] - # # ) - # return output_table + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + + drop_columns = [] + if not column_config.system_tags: + # TODO: get system tags more effiicently + drop_columns.extend( + [ + c + for c in self._cached_output_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) + if not column_config.source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not column_config.context: + drop_columns.append(constants.CONTEXT_KEY) + + output_table = self._cached_output_table.drop(drop_columns) + + # lazily prepare content hash column if requested + if column_config.content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + # TODO: verify that order will be preserved + for tag, packet in self.iter_packets(): + content_hashes.append(packet.content_hash().to_string()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." + ) + hash_column_name = ( + "_content_hash" + if column_config.content_hash is True + else column_config.content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + + if column_config.sort_by_tags: + # TODO: reimplement using polars natively + output_table = ( + pl.DataFrame(output_table) + .sort(by=self.keys()[0], descending=False) + .to_arrow() + ) + # output_table = output_table.sort_by( + # [(column, "ascending") for column in self.keys()[0]] + # ) + return output_table class CallableWithPod(Protocol): @@ -343,6 +361,7 @@ def function_pod( function_name: str | None = None, version: str = "v0.0", label: str | None = None, + result_database: ArrowDatabase | None = None, **kwargs, ) -> Callable[..., CallableWithPod]: """ @@ -373,6 +392,13 @@ def decorator(func: Callable) -> CallableWithPod: **kwargs, ) + # if database is provided, wrap in CachedPacketFunction + if result_database is not None: + packet_function = CachedPacketFunction( + packet_function, + result_database=result_database, + ) + # Create a simple typed function pod pod = FunctionPod( packet_function=packet_function, @@ -433,6 +459,186 @@ def process(self, *streams: Stream, label: str | None = None) -> FunctionPodStre return self._function_pod.process(*streams, label=label) +class FunctionPodNode(FunctionPod): + """ + A pod that caches the results of the wrapped pod. + This is useful for pods that are expensive to compute and can benefit from caching. + """ + + def __init__( + self, + packet_function: PacketFunction, + input_streams: Collection[Stream], + pipeline_database: ArrowDatabase, + result_database: ArrowDatabase | None = None, + pipeline_path_prefix: tuple[str, ...] 
= (), + **kwargs, + ): + result_path_prefix = () + if result_database is None: + result_database = pipeline_database + # set result path to be within the pipeline path with "_result" appended + result_path_prefix = pipeline_path_prefix + ("_result",) + + self._cached_packet_function = CachedPacketFunction( + packet_function, + result_database=result_database, + record_path_prefix=result_path_prefix, + ) + + super().__init__(self._cached_packet_function, **kwargs) + + self._input_streams = input_streams + + self._pipeline_database = pipeline_database + self._pipeline_path_prefix = pipeline_path_prefix + + # take the pipeline node hash and schema hashes + self._pipeline_node_hash = self.content_hash().to_string() + + # compute tag schema hash, inclusive of system tags + tag_schema, _ = self.output_schema(columns={"system_tags": True}) + self._tag_schema_hash = self.data_context.object_hasher.hash_object( + tag_schema + ).to_string() + + def node_identity_structure(self) -> Any: + return (self.packet_function, self.argument_symmetry(self._input_streams)) + + @property + def pipeline_path(self) -> tuple[str, ...]: + return self._pipeline_path_prefix + self.uri + + @property + def uri(self) -> tuple[str, ...]: + # TODO: revisit organization of the URI components + return self._cached_packet_function.uri + ( + f"node:{self._pipeline_node_hash}", + f"tag:{self._tag_schema_hash}", + ) + + def output_schema( + self, + *streams: Stream, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> tuple[PythonSchema, PythonSchema]: + return super().output_schema( + *self._input_streams, columns=columns, all_info=all_info + ) + + def process( + self, *streams: Stream, label: str | None = None + ) -> "FunctionPodStream": + if len(streams) > 0: + raise ValueError( + "FunctionPodNode.process does not accept external streams; input streams are fixed at initialization." + ) + return super().process(*self._input_streams, label=label) + + def process_packet( + self, + tag: Tag, + packet: Packet, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[Tag, Packet | None]: + """ + Process a single packet using the pod's packet function. 
+ + Args: + tag: The tag associated with the packet + packet: The input packet to process + + Returns: + Packet | None: The processed output packet, or None if filtered out + """ + output_packet = self._cached_packet_function.call( + packet, + skip_cache_lookup=skip_cache_lookup, + skip_cache_insert=skip_cache_insert, + ) + + if output_packet is not None: + # check if the packet was computed or retrieved from cache + result_computed = bool( + output_packet.get_meta_value( + self._cached_packet_function.RESULT_COMPUTED_FLAG, False + ) + ) + self.add_pipeline_record( + tag, + packet, + packet_record_id=output_packet.record_id, + computed=result_computed, + ) + + return tag, output_packet + + def add_pipeline_record( + self, + tag: Tag, + input_packet: Packet, + packet_record_id: str, + computed: bool, + skip_cache_lookup: bool = False, + ) -> None: + # combine dp.Tag with packet content hash to compute entry hash + # TODO: add system tag columns + # TODO: consider using bytes instead of string representation + tag_with_hash = tag.as_table(columns={"system_tags": True}).append_column( + constants.INPUT_PACKET_HASH_COL, + pa.array([input_packet.content_hash().to_string()], type=pa.large_string()), + ) + + # unique entry ID is determined by the combination of tags, system_tags, and input_packet hash + entry_id = self.data_context.arrow_hasher.hash_table(tag_with_hash).to_string() + + # check presence of an existing entry with the same entry_id + existing_record = None + if not skip_cache_lookup: + existing_record = self._pipeline_database.get_record_by_id( + self.pipeline_path, + entry_id, + ) + + if existing_record is not None: + # if the record already exists, then skip adding + return + + # rename all keys to avoid potential collision with result columns + renamed_input_packet = input_packet.rename( + {k: f"_input_{k}" for k in input_packet.keys()} + ) + input_packet_info = ( + renamed_input_packet.as_table(columns={"source": True}) + .append_column( + constants.PACKET_RECORD_ID, + pa.array([packet_record_id], type=pa.large_string()), + ) + .append_column( + f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", + pa.array([input_packet.data_context_key], type=pa.large_string()), + ) + .append_column( + f"{constants.META_PREFIX}computed", + pa.array([computed], type=pa.bool_()), + ) + .drop_columns(list(renamed_input_packet.keys())) + ) + + combined_record = arrow_utils.hstack_tables( + tag.as_table(columns={"system_tags": True}), input_packet_info + ) + + self._pipeline_database.add_record( + self.pipeline_path, + entry_id, + combined_record, + skip_duplicates=False, + ) + + # class CachedFunctionPod(WrappedFunctionPod): # """ # A pod that caches the results of the wrapped pod. @@ -678,29 +884,3 @@ def process(self, *streams: Stream, label: str | None = None) -> FunctionPodStre # result_table, # meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, # ) - -# def get_all_cached_outputs( -# self, include_system_columns: bool = False -# ) -> "pa.Table | None": -# """ -# Get all records from the result store for this pod. -# If include_system_columns is True, include system columns in the result. 
-# """ -# record_id_column = ( -# constants.PACKET_RECORD_ID if include_system_columns else None -# ) -# result_table = self.result_database.get_all_records( -# self.record_path, record_id_column=record_id_column -# ) -# if result_table is None or result_table.num_rows == 0: -# return None - -# if not include_system_columns: -# # remove input packet hash and tiered pod ID columns -# pod_id_columns = [ -# f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() -# ] -# result_table = result_table.drop_columns(pod_id_columns) -# result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) - -# return result_table diff --git a/src/orcapod/core/packet_function.py b/src/orcapod/core/packet_function.py index 836fef8..ab4deac 100644 --- a/src/orcapod/core/packet_function.py +++ b/src/orcapod/core/packet_function.py @@ -6,6 +6,8 @@ from collections.abc import Callable, Collection, Iterable, Sequence from typing import TYPE_CHECKING, Any, Literal +from uuid_utils import uuid7 + from orcapod.core.base import OrcapodBase from orcapod.core.datagrams import DictPacket, ArrowPacket from orcapod.hashing.hash_utils import get_function_components, get_function_signature @@ -85,10 +87,10 @@ def __init__(self, version: str = "v0.0", **kwargs): self._active = True self._version = version - match = re.match(r"\D.*(\d+)", version) + match = re.match(r"\D*(\d+)\.(.*)", version) if match: self._major_version = int(match.group(1)) - self._minor_version = version[match.end(1) :] + self._minor_version = match.group(2) else: raise ValueError( f"Version string {version} does not contain a valid version number" @@ -98,15 +100,16 @@ def __init__(self, version: str = "v0.0", **kwargs): def uri(self) -> tuple[str, ...]: # TODO: make this more efficient return ( - f"{self.packet_function_type_id}", f"{self.canonical_function_name}", self.data_context.object_hasher.hash_object( self.output_packet_schema ).to_string(), + f"v{self.major_version}", + self.packet_function_type_id, ) def identity_structure(self) -> Any: - return self.get_function_variation_data() + return self.uri @property def major_version(self) -> int: @@ -315,7 +318,23 @@ def call(self, packet: Packet) -> Packet | None: f"Number of output keys {len(self._output_keys)}:{self._output_keys} does not match number of values returned by function {len(output_values)}" ) - return DictPacket({k: v for k, v in zip(self._output_keys, output_values)}) + def combine(*components: tuple[str, ...]) -> str: + inner_parsed = [":".join(component) for component in components] + return "::".join(inner_parsed) + + output_data = {k: v for k, v in zip(self._output_keys, output_values)} + + record_id = str(uuid7()) + + source_info = {k: combine(self.uri, (record_id,), (k,)) for k in output_data} + + return DictPacket( + output_data, + source_info=source_info, + record_id=record_id, + python_schema=self.output_packet_schema, + data_context=self.data_context, + ) async def async_call(self, packet: Packet) -> Packet | None: raise NotImplementedError("Async call not implemented for synchronous function") @@ -376,7 +395,7 @@ class CachedPacketFunction(PacketFunctionWrapper): """ # name of the column in the tag store that contains the packet hash - DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" + RESULT_COMPUTED_FLAG = f"{constants.META_PREFIX}computed" def __init__( self, @@ -386,8 +405,16 @@ def __init__( **kwargs, ) -> None: super().__init__(packet_function, **kwargs) - self._record_path_prefix = record_path_prefix self._result_database = 
result_database + self._record_path_prefix = record_path_prefix + self._auto_flush = True + + def set_auto_flush(self, on: bool = True) -> None: + """ + Set the auto-flush behavior of the result database. + If set to True, the result database will flush after each record is added. + """ + self._auto_flush = on @property def record_path(self) -> tuple[str, ...]: @@ -408,74 +435,18 @@ def call( output_packet = None if not skip_cache_lookup: print("Checking for cache...") + # lookup stored result for the input packet output_packet = self.get_cached_output_for_packet(packet) if output_packet is not None: print(f"Cache hit for {packet}!") if output_packet is None: output_packet = self._packet_function.call(packet) - if output_packet is not None and not skip_cache_insert: - self.record_packet(packet, output_packet) - - return output_packet - - def record_packet( - self, - input_packet: Packet, - output_packet: Packet, - skip_duplicates: bool = False, - ) -> Packet: - """ - Record the output packet against the input packet in the result store. - """ - - # TODO: consider incorporating execution_engine_opts into the record - data_table = output_packet.as_table(columns={"source": True, "context": True}) - - # for i, (k, v) in enumerate(self.tiered_pod_id.items()): - # # add the tiered pod ID to the data table - # data_table = data_table.add_column( - # i, - # f"{constants.POD_ID_PREFIX}{k}", - # pa.array([v], type=pa.large_string()), - # ) - - # add the input packet hash as a column - data_table = data_table.add_column( - 0, - constants.INPUT_PACKET_HASH_COL, - pa.array([input_packet.content_hash().to_string()], type=pa.large_string()), - ) - # # add execution engine information - # execution_engine_hash = execution_engine.name if execution_engine else "default" - # data_table = data_table.append_column( - # constants.EXECUTION_ENGINE, - # pa.array([execution_engine_hash], type=pa.large_string()), - # ) - - # add computation timestamp - timestamp = datetime.now(timezone.utc) - data_table = data_table.append_column( - constants.POD_TIMESTAMP, - pa.array([timestamp], type=pa.timestamp("us", tz="UTC")), - ) + if output_packet is not None: + if not skip_cache_insert: + self.record_packet(packet, output_packet) + # add meta column to indicate that this was computed + output_packet.with_meta_columns(**{self.RESULT_COMPUTED_FLAG: True}) - # if record_id is None: - # record_id = self.get_record_id( - # input_packet, execution_engine_hash=execution_engine_hash - # ) - - # self.result_database.add_record( - # self.record_path, - # record_id, - # data_table, - # skip_duplicates=skip_duplicates, - # ) - # if result_flag is None: - # # TODO: do more specific error handling - # raise ValueError( - # f"Failed to record packet {input_packet} in result store {self.result_store}" - # ) - # # TODO: make store return retrieved table return output_packet def get_cached_output_for_packet(self, input_packet: Packet) -> Packet | None: @@ -485,10 +456,6 @@ def get_cached_output_for_packet(self, input_packet: Packet) -> Packet | None: will be applied. If the output packet is not found, return None. 
""" - # result_table = self.result_store.get_record_by_id( - # self.record_path, - # self.get_entry_hash(input_packet), - # ) # get all records with matching the input packet hash # TODO: add match based on match_tier if specified @@ -498,10 +465,13 @@ def get_cached_output_for_packet(self, input_packet: Packet) -> Packet | None: constants.INPUT_PACKET_HASH_COL: input_packet.content_hash().to_string() } + RECORD_ID_COLUMN = "_record_id" result_table = self._result_database.get_records_with_column_value( self.record_path, constraints, + record_id_column=RECORD_ID_COLUMN, ) + if result_table is None or result_table.num_rows == 0: return None @@ -513,15 +483,76 @@ def get_cached_output_for_packet(self, input_packet: Packet) -> Packet | None: constants.POD_TIMESTAMP, ascending=False ).take([0]) - # result_table = result_table.drop_columns(pod_id_columns) - result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH_COL) + # extract the record_id column + record_id = result_table.to_pylist()[0][RECORD_ID_COLUMN] + result_table = result_table.drop_columns( + [RECORD_ID_COLUMN, constants.INPUT_PACKET_HASH_COL] + ) # note that data context will be loaded from the result store return ArrowPacket( result_table, - meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, + record_id=record_id, + meta_info={self.RESULT_COMPUTED_FLAG: False}, ) + def record_packet( + self, + input_packet: Packet, + output_packet: Packet, + skip_duplicates: bool = False, + ) -> Packet: + """ + Record the output packet against the input packet in the result store. + """ + + # TODO: consider incorporating execution_engine_opts into the record + data_table = output_packet.as_table(columns={"source": True, "context": True}) + + i = -1 + for i, (k, v) in enumerate(self.get_function_variation_data().items()): + # add the tiered pod ID to the data table + data_table = data_table.add_column( + i, + f"{constants.PF_VARIATION_PREFIX}{k}", + pa.array([v], type=pa.large_string()), + ) + + for j, (k, v) in enumerate(self.get_execution_data().items()): + # add the tiered pod ID to the data table + data_table = data_table.add_column( + i + j + 1, + f"{constants.PF_EXECUTION_PREFIX}{k}", + pa.array([v], type=pa.large_string()), + ) + + # add the input packet hash as a column + data_table = data_table.add_column( + 0, + constants.INPUT_PACKET_HASH_COL, + pa.array([input_packet.content_hash().to_string()], type=pa.large_string()), + ) + + # add computation timestamp + timestamp = datetime.now(timezone.utc) + data_table = data_table.append_column( + constants.POD_TIMESTAMP, + pa.array([timestamp], type=pa.timestamp("us", tz="UTC")), + ) + + self._result_database.add_record( + self.record_path, + output_packet.record_id, + data_table, + skip_duplicates=skip_duplicates, + ) + + if self._auto_flush: + self._result_database.flush() + + # TODO: make store return retrieved table + return output_packet + def get_all_cached_outputs( self, include_system_columns: bool = False ) -> "pa.Table | None": diff --git a/src/orcapod/protocols/core_protocols/datagrams.py b/src/orcapod/protocols/core_protocols/datagrams.py index ed6d6fa..5e6114f 100644 --- a/src/orcapod/protocols/core_protocols/datagrams.py +++ b/src/orcapod/protocols/core_protocols/datagrams.py @@ -11,7 +11,6 @@ from orcapod.protocols.hashing_protocols import ContentIdentifiable from orcapod.types import DataType, DataValue, PythonSchema -from uuid import UUID if TYPE_CHECKING: import pyarrow as pa @@ -179,7 +178,7 @@ class Datagram(ContentIdentifiable, Protocol): """ 
@property - def uuid(self) -> UUID: + def record_id(self) -> str: """ Return the UUID of this datagram. diff --git a/src/orcapod/protocols/core_protocols/packet_function.py b/src/orcapod/protocols/core_protocols/packet_function.py index a20e269..059c629 100644 --- a/src/orcapod/protocols/core_protocols/packet_function.py +++ b/src/orcapod/protocols/core_protocols/packet_function.py @@ -3,10 +3,11 @@ from orcapod.protocols.core_protocols.datagrams import Packet from orcapod.protocols.core_protocols.labelable import Labelable from orcapod.types import PythonSchema +from orcapod.protocols.hashing_protocols import ContentIdentifiable @runtime_checkable -class PacketFunction(Labelable, Protocol): +class PacketFunction(ContentIdentifiable, Labelable, Protocol): """ Protocol for packet-processing function. diff --git a/src/orcapod/system_constants.py b/src/orcapod/system_constants.py index c52d77a..8925217 100644 --- a/src/orcapod/system_constants.py +++ b/src/orcapod/system_constants.py @@ -3,6 +3,8 @@ DATAGRAM_PREFIX = "_" SOURCE_INFO_PREFIX = "source_" POD_ID_PREFIX = "pod_id_" +PF_VARIATION_PREFIX = "pf_var_" +PF_EXECUTION_PREFIX = "pf_exec_" DATA_CONTEXT_KEY = "context_key" INPUT_PACKET_HASH_COL = "input_packet_hash" PACKET_RECORD_ID = "packet_id" @@ -47,6 +49,14 @@ def CONTEXT_KEY(self) -> str: def POD_ID_PREFIX(self) -> str: return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{POD_ID_PREFIX}" + @property + def PF_VARIATION_PREFIX(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{PF_VARIATION_PREFIX}" + + @property + def PF_EXECUTION_PREFIX(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{PF_EXECUTION_PREFIX}" + @property def INPUT_PACKET_HASH_COL(self) -> str: return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{INPUT_PACKET_HASH_COL}" From 604961c69c51c9c747d78729b4dd3b38612bb6a3 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 12 Nov 2025 18:26:14 +0000 Subject: [PATCH 06/11] chore: add pre-commit hooks for auto formatting and typing --- .pre-commit-config.yaml | 21 +++++++++++++++++++++ pyproject.toml | 1 + uv.lock | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..f2cfe53 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,21 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.9 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - id: check-merge-conflict + + - repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.381 + hooks: + - id: pyright diff --git a/pyproject.toml b/pyproject.toml index eb38aba..fb9dd84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ dev = [ "ipywidgets>=8.1.7", "jsonschema>=4.25.0", "minio>=7.2.16", + "pre-commit>=4.4.0", "pyarrow-stubs>=20.0.0.20250716", "pygraphviz>=1.14", "pyiceberg>=0.9.1", diff --git a/uv.lock b/uv.lock index d4f48ba..fe6efd0 100644 --- a/uv.lock +++ b/uv.lock @@ -389,6 +389,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009, upload-time = "2024-09-04T20:44:45.309Z" }, ] +[[package]] +name = "cfgv" +version = "3.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114, upload-time = "2023-08-12T20:38:17.776Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249, upload-time = "2023-08-12T20:38:16.269Z" }, +] + [[package]] name = "charset-normalizer" version = "3.4.2" @@ -976,6 +985,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl", hash = "sha256:fa0238a9e31df3373b35b0bfb672c34cc92718d21f81311d8996a16de1141d8b", size = 154547, upload-time = "2023-02-23T18:33:40.801Z" }, ] +[[package]] +name = "identify" +version = "2.6.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ff/e7/685de97986c916a6d93b3876139e00eef26ad5bbbd61925d670ae8013449/identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf", size = 99311, upload-time = "2025-10-02T17:43:40.631Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/1c/e5fd8f973d4f375adb21565739498e2e9a1e54c858a97b9a8ccfdc81da9b/identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757", size = 99183, upload-time = "2025-10-02T17:43:39.137Z" }, +] + [[package]] name = "idna" version = "3.10" @@ -1787,6 +1805,7 @@ dev = [ { name = "ipywidgets" }, { 
name = "jsonschema" }, { name = "minio" }, + { name = "pre-commit" }, { name = "pyarrow-stubs" }, { name = "pygraphviz" }, { name = "pyiceberg" }, @@ -1831,6 +1850,7 @@ dev = [ { name = "ipywidgets", specifier = ">=8.1.7" }, { name = "jsonschema", specifier = ">=4.25.0" }, { name = "minio", specifier = ">=7.2.16" }, + { name = "pre-commit", specifier = ">=4.4.0" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, { name = "pygraphviz", specifier = ">=1.14" }, { name = "pyiceberg", specifier = ">=0.9.1" }, @@ -2015,6 +2035,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/4b/0673a68ac4d6527fac951970e929c3b4440c654f994f0c957bd5556deb38/polars-1.31.0-cp39-abi3-win_arm64.whl", hash = "sha256:62ef23bb9d10dca4c2b945979f9a50812ac4ace4ed9e158a6b5d32a7322e6f75", size = 31469078, upload-time = "2025-06-18T11:59:59.242Z" }, ] +[[package]] +name = "pre-commit" +version = "4.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cfgv" }, + { name = "identify" }, + { name = "nodeenv" }, + { name = "pyyaml" }, + { name = "virtualenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/49/7845c2d7bf6474efd8e27905b51b11e6ce411708c91e829b93f324de9929/pre_commit-4.4.0.tar.gz", hash = "sha256:f0233ebab440e9f17cabbb558706eb173d19ace965c68cdce2c081042b4fab15", size = 197501, upload-time = "2025-11-08T21:12:11.607Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/11/574fe7d13acf30bfd0a8dd7fa1647040f2b8064f13f43e8c963b1e65093b/pre_commit-4.4.0-py2.py3-none-any.whl", hash = "sha256:b35ea52957cbf83dcc5d8ee636cbead8624e3a15fbfa61a370e42158ac8a5813", size = 226049, upload-time = "2025-11-08T21:12:10.228Z" }, +] + [[package]] name = "prometheus-client" version = "0.22.1" From 1a56f8714386f2252935db72f446351213744369 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 12 Nov 2025 18:47:21 +0000 Subject: [PATCH 07/11] style: apply pre-commit to all files --- .pre-commit-config.yaml | 18 +-- pyproject.toml | 3 +- src/orcapod/__init__.py | 1 - src/orcapod/core/pods.py | 6 +- src/orcapod/core/streams/base.py | 4 +- src/orcapod/core/streams/lazy_pod_stream.py | 3 +- src/orcapod/core/streams/pod_node_stream.py | 18 +-- src/orcapod/hashing/string_cachers.py | 4 +- src/orcapod/pipeline/graph.py | 7 +- src/orcapod/pipeline/nodes.py | 5 +- src/orcapod/protocols/core_protocols/base.py | 1 + tests/test_hashing/test_sqlite_cacher.py | 4 +- .../test_string_cacher/test_sqlite_cacher.py | 4 +- uv.lock | 113 ++++++++++++++---- 14 files changed, 130 insertions(+), 61 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f2cfe53..12724a5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,21 +1,21 @@ repos: + - repo: https://github.com/tsvikas/sync-with-uv + rev: v0.4.0 # replace with the latest version + hooks: + - id: sync-with-uv - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.9 + rev: v0.14.4 hooks: - - id: ruff - args: [--fix] - id: ruff-format + types_or: [ python, pyi ] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v6.0.0 hooks: - id: trailing-whitespace + types_or: [ python, pyi ] - id: end-of-file-fixer + types_or: [ python, pyi ] - id: check-yaml - id: check-added-large-files - id: check-merge-conflict - - - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.381 - hooks: - - id: pyright diff --git a/pyproject.toml b/pyproject.toml index fb9dd84..297f883 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dev = [ "jsonschema>=4.25.0", "minio>=7.2.16", "pre-commit>=4.4.0", + "pre-commit-hooks>=6.0.0", "pyarrow-stubs>=20.0.0.20250716", "pygraphviz>=1.14", "pyiceberg>=0.9.1", @@ -63,7 +64,7 @@ dev = [ "pytest-cov>=6.1.1", "ray[default]==2.48.0", "redis>=6.2.0", - "ruff>=0.11.11", + "ruff>=0.14.4", "sphinx>=8.2.3", "tqdm>=4.67.1", ] diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index 226850e..f1ebd4b 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -9,7 +9,6 @@ from .pipeline import Pipeline - no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking __all__ = [ diff --git a/src/orcapod/core/pods.py b/src/orcapod/core/pods.py index 9e2f9ad..f90f629 100644 --- a/src/orcapod/core/pods.py +++ b/src/orcapod/core/pods.py @@ -254,7 +254,6 @@ def function_pod( """ def decorator(func: Callable) -> CallableWithPod: - if func.__name__ == "": raise ValueError("Lambda functions cannot be used with function_pod") @@ -276,6 +275,7 @@ def wrapper(*args, **kwargs): ) setattr(wrapper, "pod", pod) return cast(CallableWithPod, wrapper) + return decorator @@ -496,9 +496,7 @@ async def async_call( if execution_engine is not None: # use the provided execution engine to run the function values = await execution_engine.submit_async( - self.function, - fn_kwargs=input_dict, - **(execution_engine_opts or {}) + self.function, fn_kwargs=input_dict, **(execution_engine_opts or {}) ) else: values = self.function(**input_dict) diff --git a/src/orcapod/core/streams/base.py b/src/orcapod/core/streams/base.py index 2959cf3..8b15b8e 100644 --- a/src/orcapod/core/streams/base.py +++ b/src/orcapod/core/streams/base.py @@ -477,7 +477,9 @@ def flow( def _repr_html_(self) -> str: df = self.as_polars_df() # reorder columns - new_column_order = [c for c in df.columns if c in self.tag_keys()] + [c for c in df.columns if c not in 
self.tag_keys()] + new_column_order = [c for c in df.columns if c in self.tag_keys()] + [ + c for c in df.columns if c not in self.tag_keys() + ] df = df[new_column_order] tag_map = {t: f"*{t}" for t in self.tag_keys()} # TODO: construct repr html better diff --git a/src/orcapod/core/streams/lazy_pod_stream.py b/src/orcapod/core/streams/lazy_pod_stream.py index 23f146a..bac8990 100644 --- a/src/orcapod/core/streams/lazy_pod_stream.py +++ b/src/orcapod/core/streams/lazy_pod_stream.py @@ -225,7 +225,8 @@ def as_table( # TODO: verify that order will be preserved for tag, packet in self.iter_packets( execution_engine=execution_engine or self.execution_engine, - execution_engine_opts=execution_engine_opts or self._execution_engine_opts, + execution_engine_opts=execution_engine_opts + or self._execution_engine_opts, ): content_hashes.append(packet.content_hash().to_string()) self._cached_content_hash_column = pa.array( diff --git a/src/orcapod/core/streams/pod_node_stream.py b/src/orcapod/core/streams/pod_node_stream.py index 4596bcb..a372137 100644 --- a/src/orcapod/core/streams/pod_node_stream.py +++ b/src/orcapod/core/streams/pod_node_stream.py @@ -67,7 +67,8 @@ async def run_async( This is typically called before iterating over the packets. """ if self._cached_output_packets is None: - cached_results, missing = self._identify_existing_and_missing_entries(*args, + cached_results, missing = self._identify_existing_and_missing_entries( + *args, execution_engine=execution_engine, execution_engine_opts=execution_engine_opts, **kwargs, @@ -90,6 +91,7 @@ async def run_async( pending_calls.append(pending) import asyncio + completed_calls = await asyncio.gather(*pending_calls) for result in completed_calls: cached_results.append(result) @@ -99,12 +101,14 @@ async def run_async( self._set_modified_time() self.pod_node.flush() - def _identify_existing_and_missing_entries(self, - *args: Any, + def _identify_existing_and_missing_entries( + self, + *args: Any, execution_engine: cp.ExecutionEngine | None = None, execution_engine_opts: dict[str, Any] | None = None, - **kwargs: Any) -> tuple[list[tuple[cp.Tag, cp.Packet|None]], pa.Table | None]: - cached_results: list[tuple[cp.Tag, cp.Packet|None]] = [] + **kwargs: Any, + ) -> tuple[list[tuple[cp.Tag, cp.Packet | None]], pa.Table | None]: + cached_results: list[tuple[cp.Tag, cp.Packet | None]] = [] # identify all entries in the input stream for which we still have not computed packets if len(args) > 0 or len(kwargs) > 0: @@ -177,8 +181,6 @@ def _identify_existing_and_missing_entries(self, for tag, packet in existing_stream.iter_packets(): cached_results.append((tag, packet)) - - return cached_results, missing def run( @@ -230,7 +232,6 @@ def run( ) cached_results.append((tag, output_packet)) - # reset the cache and set new results self.clear_cache() self._cached_output_packets = cached_results @@ -276,7 +277,6 @@ def iter_packets( self._cached_output_packets = cached_results self._set_modified_time() - def keys( self, include_system_tags: bool = False ) -> tuple[tuple[str, ...], tuple[str, ...]]: diff --git a/src/orcapod/hashing/string_cachers.py b/src/orcapod/hashing/string_cachers.py index caa6c93..21e93bb 100644 --- a/src/orcapod/hashing/string_cachers.py +++ b/src/orcapod/hashing/string_cachers.py @@ -316,7 +316,7 @@ def _init_database(self) -> None: ) """) conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_last_accessed + CREATE INDEX IF NOT EXISTS idx_last_accessed ON cache_entries(last_accessed) """) conn.commit() @@ -330,7 +330,7 @@ def 
_load_from_database(self) -> None: try: with sqlite3.connect(self.db_path) as conn: cursor = conn.execute(""" - SELECT key, value FROM cache_entries + SELECT key, value FROM cache_entries ORDER BY last_accessed DESC """) diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 45d83e0..84bb565 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -45,8 +45,6 @@ def run_in_thread(): return asyncio.run(async_func(*args, **kwargs)) - - class GraphNode: def __init__(self, label: str, id: int, kernel_type: str): self.label = label @@ -230,7 +228,10 @@ def run( may implement more efficient graph traversal algorithms. """ import networkx as nx - if run_async is True and (execution_engine is None or not execution_engine.supports_async): + + if run_async is True and ( + execution_engine is None or not execution_engine.supports_async + ): raise ValueError( "Cannot run asynchronously with an execution engine that does not support async." ) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index af63971..3eace50 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -270,7 +270,7 @@ def __init__( def execution_engine_opts(self) -> dict[str, Any]: return self._execution_engine_opts.copy() - @execution_engine_opts.setter + @execution_engine_opts.setter def execution_engine_opts(self, opts: dict[str, Any]) -> None: self._execution_engine_opts = opts @@ -322,7 +322,6 @@ def call( if execution_engine_opts is not None: combined_execution_engine_opts.update(execution_engine_opts) - tag, output_packet = super().call( tag, packet, @@ -362,12 +361,10 @@ async def async_call( if record_id is None: record_id = self.get_record_id(packet, execution_engine_hash) - combined_execution_engine_opts = self.execution_engine_opts if execution_engine_opts is not None: combined_execution_engine_opts.update(execution_engine_opts) - tag, output_packet = await super().async_call( tag, packet, diff --git a/src/orcapod/protocols/core_protocols/base.py b/src/orcapod/protocols/core_protocols/base.py index 87d9a81..4d4dc45 100644 --- a/src/orcapod/protocols/core_protocols/base.py +++ b/src/orcapod/protocols/core_protocols/base.py @@ -41,6 +41,7 @@ class ExecutionEngine(Protocol): "local", "threadpool", "processpool", or "ray" and is used for logging and diagnostics. 
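# A minimal sketch of an engine satisfying this protocol, based only on what is visible in
# this patch series: a `supports_async` property and an awaitable
# `submit_async(fn, fn_kwargs=...)` (used elsewhere as
# `await execution_engine.submit_async(self.function, fn_kwargs=input_dict, ...)`).
# The class name `LocalThreadEngine` and its `name` value are hypothetical, not orcapod API.
import asyncio
from collections.abc import Callable
from typing import Any


class LocalThreadEngine:
    """Runs submitted functions on a worker thread; advertises async support."""

    name = "threadpool"  # identifier used for logging and diagnostics only

    @property
    def supports_async(self) -> bool:
        return True

    async def submit_async(
        self,
        fn: Callable[..., Any],
        fn_kwargs: dict[str, Any] | None = None,
        **opts: Any,
    ) -> Any:
        # Off-load the (possibly blocking) function to a thread so the event loop stays free.
        return await asyncio.to_thread(fn, **(fn_kwargs or {}))


# Usage sketch:
#     def add(a: int, b: int) -> int:
#         return a + b
#     engine = LocalThreadEngine()
#     asyncio.run(engine.submit_async(add, fn_kwargs={"a": 1, "b": 2}))  # -> 3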
""" + @property def supports_async(self) -> bool: """Indicate whether this engine supports async execution.""" diff --git a/tests/test_hashing/test_sqlite_cacher.py b/tests/test_hashing/test_sqlite_cacher.py index 6018b30..6031392 100644 --- a/tests/test_hashing/test_sqlite_cacher.py +++ b/tests/test_hashing/test_sqlite_cacher.py @@ -47,7 +47,7 @@ def test_database_initialization(): # Check that table exists with correct schema with sqlite3.connect(db_file) as conn: cursor = conn.execute(""" - SELECT sql FROM sqlite_master + SELECT sql FROM sqlite_master WHERE type='table' AND name='cache_entries' """) schema = cursor.fetchone()[0] @@ -58,7 +58,7 @@ def test_database_initialization(): # Check that index exists cursor = conn.execute(""" - SELECT name FROM sqlite_master + SELECT name FROM sqlite_master WHERE type='index' AND name='idx_last_accessed' """) assert cursor.fetchone() is not None diff --git a/tests/test_hashing/test_string_cacher/test_sqlite_cacher.py b/tests/test_hashing/test_string_cacher/test_sqlite_cacher.py index f51069b..3ead001 100644 --- a/tests/test_hashing/test_string_cacher/test_sqlite_cacher.py +++ b/tests/test_hashing/test_string_cacher/test_sqlite_cacher.py @@ -47,7 +47,7 @@ def test_database_initialization(): # Check that table exists with correct schema with sqlite3.connect(db_file) as conn: cursor = conn.execute(""" - SELECT sql FROM sqlite_master + SELECT sql FROM sqlite_master WHERE type='table' AND name='cache_entries' """) schema = cursor.fetchone()[0] @@ -58,7 +58,7 @@ def test_database_initialization(): # Check that index exists cursor = conn.execute(""" - SELECT name FROM sqlite_master + SELECT name FROM sqlite_master WHERE type='index' AND name='idx_last_accessed' """) assert cursor.fetchone() is not None diff --git a/uv.lock b/uv.lock index fe6efd0..34bba27 100644 --- a/uv.lock +++ b/uv.lock @@ -1806,6 +1806,7 @@ dev = [ { name = "jsonschema" }, { name = "minio" }, { name = "pre-commit" }, + { name = "pre-commit-hooks" }, { name = "pyarrow-stubs" }, { name = "pygraphviz" }, { name = "pyiceberg" }, @@ -1851,6 +1852,7 @@ dev = [ { name = "jsonschema", specifier = ">=4.25.0" }, { name = "minio", specifier = ">=7.2.16" }, { name = "pre-commit", specifier = ">=4.4.0" }, + { name = "pre-commit-hooks", specifier = ">=6.0.0" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, { name = "pygraphviz", specifier = ">=1.14" }, { name = "pyiceberg", specifier = ">=0.9.1" }, @@ -1859,7 +1861,7 @@ dev = [ { name = "pytest-cov", specifier = ">=6.1.1" }, { name = "ray", extras = ["default"], specifier = "==2.48.0" }, { name = "redis", specifier = ">=6.2.0" }, - { name = "ruff", specifier = ">=0.11.11" }, + { name = "ruff", specifier = ">=0.14.4" }, { name = "sphinx", specifier = ">=8.2.3" }, { name = "tqdm", specifier = ">=4.67.1" }, ] @@ -2051,6 +2053,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/27/11/574fe7d13acf30bfd0a8dd7fa1647040f2b8064f13f43e8c963b1e65093b/pre_commit-4.4.0-py2.py3-none-any.whl", hash = "sha256:b35ea52957cbf83dcc5d8ee636cbead8624e3a15fbfa61a370e42158ac8a5813", size = 226049, upload-time = "2025-11-08T21:12:10.228Z" }, ] +[[package]] +name = "pre-commit-hooks" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ruamel-yaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/36/4d/93e63e48f8fd16d6c1e4cef5dabadcade4d1325c7fd6f29f075a4d2284f3/pre_commit_hooks-6.0.0.tar.gz", hash = "sha256:76d8370c006f5026cdd638a397a678d26dda735a3c88137e05885a020f824034", size = 
28293, upload-time = "2025-08-09T19:25:04.6Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/46/eba9be9daa403fa94854ce16a458c29df9a01c6c047931c3d8be6016cd9a/pre_commit_hooks-6.0.0-py2.py3-none-any.whl", hash = "sha256:76161b76d321d2f8ee2a8e0b84c30ee8443e01376121fd1c90851e33e3bd7ee2", size = 41338, upload-time = "2025-08-09T19:25:03.513Z" }, +] + [[package]] name = "prometheus-client" version = "0.22.1" @@ -2891,29 +2905,84 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, ] +[[package]] +name = "ruamel-yaml" +version = "0.18.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ruamel-yaml-clib", marker = "python_full_version < '3.14' and platform_python_implementation == 'CPython'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/c7/ee630b29e04a672ecfc9b63227c87fd7a37eb67c1bf30fe95376437f897c/ruamel.yaml-0.18.16.tar.gz", hash = "sha256:a6e587512f3c998b2225d68aa1f35111c29fad14aed561a26e73fab729ec5e5a", size = 147269, upload-time = "2025-10-22T17:54:02.346Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/73/bb1bc2529f852e7bf64a2dec885e89ff9f5cc7bbf6c9340eed30ff2c69c5/ruamel.yaml-0.18.16-py3-none-any.whl", hash = "sha256:048f26d64245bae57a4f9ef6feb5b552a386830ef7a826f235ffb804c59efbba", size = 119858, upload-time = "2025-10-22T17:53:59.012Z" }, +] + +[[package]] +name = "ruamel-yaml-clib" +version = "0.2.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/e9/39ec4d4b3f91188fad1842748f67d4e749c77c37e353c4e545052ee8e893/ruamel.yaml.clib-0.2.14.tar.gz", hash = "sha256:803f5044b13602d58ea378576dd75aa759f52116a0232608e8fdada4da33752e", size = 225394, upload-time = "2025-09-22T19:51:23.753Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/9f/3c51e9578b8c36fcc4bdd271a1a5bb65963a74a4b6ad1a989768a22f6c2a/ruamel.yaml.clib-0.2.14-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5bae1a073ca4244620425cd3d3aa9746bde590992b98ee8c7c8be8c597ca0d4e", size = 270207, upload-time = "2025-09-23T14:24:11.445Z" }, + { url = "https://files.pythonhosted.org/packages/4a/16/cb02815bc2ae9c66760c0c061d23c7358f9ba51dae95ac85247662b7fbe2/ruamel.yaml.clib-0.2.14-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0a54e5e40a7a691a426c2703b09b0d61a14294d25cfacc00631aa6f9c964df0d", size = 137780, upload-time = "2025-09-22T19:50:37.734Z" }, + { url = "https://files.pythonhosted.org/packages/31/c6/fc687cd1b93bff8e40861eea46d6dc1a6a778d9a085684e4045ff26a8e40/ruamel.yaml.clib-0.2.14-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:10d9595b6a19778f3269399eff6bab642608e5966183abc2adbe558a42d4efc9", size = 641590, upload-time = "2025-09-22T19:50:41.978Z" }, + { url = "https://files.pythonhosted.org/packages/45/5d/65a2bc08b709b08576b3f307bf63951ee68a8e047cbbda6f1c9864ecf9a7/ruamel.yaml.clib-0.2.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dba72975485f2b87b786075e18a6e5d07dc2b4d8973beb2732b9b2816f1bad70", size = 738090, upload-time = "2025-09-22T19:50:39.152Z" }, + { url = 
"https://files.pythonhosted.org/packages/fb/d0/a70a03614d9a6788a3661ab1538879ed2aae4e84d861f101243116308a37/ruamel.yaml.clib-0.2.14-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29757bdb7c142f9595cc1b62ec49a3d1c83fab9cef92db52b0ccebaad4eafb98", size = 700744, upload-time = "2025-09-22T19:50:40.811Z" }, + { url = "https://files.pythonhosted.org/packages/77/30/c93fa457611f79946d5cb6cc97493ca5425f3f21891d7b1f9b44eaa1b38e/ruamel.yaml.clib-0.2.14-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:557df28dbccf79b152fe2d1b935f6063d9cc431199ea2b0e84892f35c03bb0ee", size = 742321, upload-time = "2025-09-23T18:42:48.916Z" }, + { url = "https://files.pythonhosted.org/packages/40/85/e2c54ad637117cd13244a4649946eaa00f32edcb882d1f92df90e079ab00/ruamel.yaml.clib-0.2.14-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:26a8de280ab0d22b6e3ec745b4a5a07151a0f74aad92dd76ab9c8d8d7087720d", size = 743805, upload-time = "2025-09-22T19:50:43.58Z" }, + { url = "https://files.pythonhosted.org/packages/81/50/f899072c38877d8ef5382e0b3d47f8c4346226c1f52d6945d6f64fec6a2f/ruamel.yaml.clib-0.2.14-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e501c096aa3889133d674605ebd018471bc404a59cbc17da3c5924421c54d97c", size = 769529, upload-time = "2025-09-22T19:50:45.707Z" }, + { url = "https://files.pythonhosted.org/packages/99/7c/96d4b5075e30c65ea2064e40c2d657c7c235d7b6ef18751cf89a935b9041/ruamel.yaml.clib-0.2.14-cp311-cp311-win32.whl", hash = "sha256:915748cfc25b8cfd81b14d00f4bfdb2ab227a30d6d43459034533f4d1c207a2a", size = 100256, upload-time = "2025-09-22T19:50:48.26Z" }, + { url = "https://files.pythonhosted.org/packages/7d/8c/73ee2babd04e8bfcf1fd5c20aa553d18bf0ebc24b592b4f831d12ae46cc0/ruamel.yaml.clib-0.2.14-cp311-cp311-win_amd64.whl", hash = "sha256:4ccba93c1e5a40af45b2f08e4591969fa4697eae951c708f3f83dcbf9f6c6bb1", size = 118234, upload-time = "2025-09-22T19:50:47.019Z" }, + { url = "https://files.pythonhosted.org/packages/b4/42/ccfb34a25289afbbc42017e4d3d4288e61d35b2e00cfc6b92974a6a1f94b/ruamel.yaml.clib-0.2.14-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:6aeadc170090ff1889f0d2c3057557f9cd71f975f17535c26a5d37af98f19c27", size = 271775, upload-time = "2025-09-23T14:24:12.771Z" }, + { url = "https://files.pythonhosted.org/packages/82/73/e628a92e80197ff6a79ab81ec3fa00d4cc082d58ab78d3337b7ba7043301/ruamel.yaml.clib-0.2.14-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5e56ac47260c0eed992789fa0b8efe43404a9adb608608631a948cee4fc2b052", size = 138842, upload-time = "2025-09-22T19:50:49.156Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c5/346c7094344a60419764b4b1334d9e0285031c961176ff88ffb652405b0c/ruamel.yaml.clib-0.2.14-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:a911aa73588d9a8b08d662b9484bc0567949529824a55d3885b77e8dd62a127a", size = 647404, upload-time = "2025-09-22T19:50:52.921Z" }, + { url = "https://files.pythonhosted.org/packages/df/99/65080c863eb06d4498de3d6c86f3e90595e02e159fd8529f1565f56cfe2c/ruamel.yaml.clib-0.2.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a05ba88adf3d7189a974b2de7a9d56731548d35dc0a822ec3dc669caa7019b29", size = 753141, upload-time = "2025-09-22T19:50:50.294Z" }, + { url = "https://files.pythonhosted.org/packages/3d/e3/0de85f3e3333f8e29e4b10244374a202a87665d1131798946ee22cf05c7c/ruamel.yaml.clib-0.2.14-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:fb04c5650de6668b853623eceadcdb1a9f2fee381f5d7b6bc842ee7c239eeec4", size = 703477, upload-time = "2025-09-22T19:50:51.508Z" }, + { url = "https://files.pythonhosted.org/packages/d9/25/0d2f09d8833c7fd77ab8efeff213093c16856479a9d293180a0d89f6bed9/ruamel.yaml.clib-0.2.14-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:df3ec9959241d07bc261f4983d25a1205ff37703faf42b474f15d54d88b4f8c9", size = 741157, upload-time = "2025-09-23T18:42:50.408Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8c/959f10c2e2153cbdab834c46e6954b6dd9e3b109c8f8c0a3cf1618310985/ruamel.yaml.clib-0.2.14-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fbc08c02e9b147a11dfcaa1ac8a83168b699863493e183f7c0c8b12850b7d259", size = 745859, upload-time = "2025-09-22T19:50:54.497Z" }, + { url = "https://files.pythonhosted.org/packages/ed/6b/e580a7c18b485e1a5f30a32cda96b20364b0ba649d9d2baaf72f8bd21f83/ruamel.yaml.clib-0.2.14-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c099cafc1834d3c5dac305865d04235f7c21c167c8dd31ebc3d6bbc357e2f023", size = 770200, upload-time = "2025-09-22T19:50:55.718Z" }, + { url = "https://files.pythonhosted.org/packages/ef/44/3455eebc761dc8e8fdced90f2b0a3fa61e32ba38b50de4130e2d57db0f21/ruamel.yaml.clib-0.2.14-cp312-cp312-win32.whl", hash = "sha256:b5b0f7e294700b615a3bcf6d28b26e6da94e8eba63b079f4ec92e9ba6c0d6b54", size = 98829, upload-time = "2025-09-22T19:50:58.895Z" }, + { url = "https://files.pythonhosted.org/packages/76/ab/5121f7f3b651db93de546f8c982c241397aad0a4765d793aca1dac5eadee/ruamel.yaml.clib-0.2.14-cp312-cp312-win_amd64.whl", hash = "sha256:a37f40a859b503304dd740686359fcf541d6fb3ff7fc10f539af7f7150917c68", size = 115570, upload-time = "2025-09-22T19:50:57.981Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ae/e3811f05415594025e96000349d3400978adaed88d8f98d494352d9761ee/ruamel.yaml.clib-0.2.14-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7e4f9da7e7549946e02a6122dcad00b7c1168513acb1f8a726b1aaf504a99d32", size = 269205, upload-time = "2025-09-23T14:24:15.06Z" }, + { url = "https://files.pythonhosted.org/packages/72/06/7d51f4688d6d72bb72fa74254e1593c4f5ebd0036be5b41fe39315b275e9/ruamel.yaml.clib-0.2.14-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:dd7546c851e59c06197a7c651335755e74aa383a835878ca86d2c650c07a2f85", size = 137417, upload-time = "2025-09-22T19:50:59.82Z" }, + { url = "https://files.pythonhosted.org/packages/5a/08/b4499234a420ef42960eeb05585df5cc7eb25ccb8c980490b079e6367050/ruamel.yaml.clib-0.2.14-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:1c1acc3a0209ea9042cc3cfc0790edd2eddd431a2ec3f8283d081e4d5018571e", size = 642558, upload-time = "2025-09-22T19:51:03.388Z" }, + { url = "https://files.pythonhosted.org/packages/b6/ba/1975a27dedf1c4c33306ee67c948121be8710b19387aada29e2f139c43ee/ruamel.yaml.clib-0.2.14-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2070bf0ad1540d5c77a664de07ebcc45eebd1ddcab71a7a06f26936920692beb", size = 744087, upload-time = "2025-09-22T19:51:00.897Z" }, + { url = "https://files.pythonhosted.org/packages/20/15/8a19a13d27f3bd09fa18813add8380a29115a47b553845f08802959acbce/ruamel.yaml.clib-0.2.14-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bd8fe07f49c170e09d76773fb86ad9135e0beee44f36e1576a201b0676d3d1d", size = 699709, upload-time = "2025-09-22T19:51:02.075Z" }, + { url = 
"https://files.pythonhosted.org/packages/19/ee/8d6146a079ad21e534b5083c9ee4a4c8bec42f79cf87594b60978286b39a/ruamel.yaml.clib-0.2.14-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ff86876889ea478b1381089e55cf9e345707b312beda4986f823e1d95e8c0f59", size = 708926, upload-time = "2025-09-23T18:42:51.707Z" }, + { url = "https://files.pythonhosted.org/packages/a9/f5/426b714abdc222392e68f3b8ad323930d05a214a27c7e7a0f06c69126401/ruamel.yaml.clib-0.2.14-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1f118b707eece8cf84ecbc3e3ec94d9db879d85ed608f95870d39b2d2efa5dca", size = 740202, upload-time = "2025-09-22T19:51:04.673Z" }, + { url = "https://files.pythonhosted.org/packages/3d/ac/3c5c2b27a183f4fda8a57c82211721c016bcb689a4a175865f7646db9f94/ruamel.yaml.clib-0.2.14-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b30110b29484adc597df6bd92a37b90e63a8c152ca8136aad100a02f8ba6d1b6", size = 765196, upload-time = "2025-09-22T19:51:05.916Z" }, + { url = "https://files.pythonhosted.org/packages/92/2e/06f56a71fd55021c993ed6e848c9b2e5e9cfce180a42179f0ddd28253f7c/ruamel.yaml.clib-0.2.14-cp313-cp313-win32.whl", hash = "sha256:f4e97a1cf0b7a30af9e1d9dad10a5671157b9acee790d9e26996391f49b965a2", size = 98635, upload-time = "2025-09-22T19:51:08.183Z" }, + { url = "https://files.pythonhosted.org/packages/51/79/76aba16a1689b50528224b182f71097ece338e7a4ab55e84c2e73443b78a/ruamel.yaml.clib-0.2.14-cp313-cp313-win_amd64.whl", hash = "sha256:090782b5fb9d98df96509eecdbcaffd037d47389a89492320280d52f91330d78", size = 115238, upload-time = "2025-09-22T19:51:07.081Z" }, + { url = "https://files.pythonhosted.org/packages/21/e2/a59ff65c26aaf21a24eb38df777cb9af5d87ba8fc8107c163c2da9d1e85e/ruamel.yaml.clib-0.2.14-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:7df6f6e9d0e33c7b1d435defb185095386c469109de723d514142632a7b9d07f", size = 271441, upload-time = "2025-09-23T14:24:16.498Z" }, + { url = "https://files.pythonhosted.org/packages/6b/fa/3234f913fe9a6525a7b97c6dad1f51e72b917e6872e051a5e2ffd8b16fbb/ruamel.yaml.clib-0.2.14-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:70eda7703b8126f5e52fcf276e6c0f40b0d314674f896fc58c47b0aef2b9ae83", size = 137970, upload-time = "2025-09-22T19:51:09.472Z" }, + { url = "https://files.pythonhosted.org/packages/ef/ec/4edbf17ac2c87fa0845dd366ef8d5852b96eb58fcd65fc1ecf5fe27b4641/ruamel.yaml.clib-0.2.14-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:a0cb71ccc6ef9ce36eecb6272c81afdc2f565950cdcec33ae8e6cd8f7fc86f27", size = 739639, upload-time = "2025-09-22T19:51:10.566Z" }, + { url = "https://files.pythonhosted.org/packages/15/18/b0e1fafe59051de9e79cdd431863b03593ecfa8341c110affad7c8121efc/ruamel.yaml.clib-0.2.14-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e7cb9ad1d525d40f7d87b6df7c0ff916a66bc52cb61b66ac1b2a16d0c1b07640", size = 764456, upload-time = "2025-09-22T19:51:11.736Z" }, +] + [[package]] name = "ruff" -version = "0.11.12" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/15/0a/92416b159ec00cdf11e5882a9d80d29bf84bba3dbebc51c4898bfbca1da6/ruff-0.11.12.tar.gz", hash = "sha256:43cf7f69c7d7c7d7513b9d59c5d8cafd704e05944f978614aa9faff6ac202603", size = 4202289, upload-time = "2025-05-29T13:31:40.037Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/60/cc/53eb79f012d15e136d40a8e8fc519ba8f55a057f60b29c2df34efd47c6e3/ruff-0.11.12-py3-none-linux_armv6l.whl", hash = "sha256:c7680aa2f0d4c4f43353d1e72123955c7a2159b8646cd43402de6d4a3a25d7cc", size = 10285597, upload-time = 
"2025-05-29T13:30:57.539Z" }, - { url = "https://files.pythonhosted.org/packages/e7/d7/73386e9fb0232b015a23f62fea7503f96e29c29e6c45461d4a73bac74df9/ruff-0.11.12-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:2cad64843da9f134565c20bcc430642de897b8ea02e2e79e6e02a76b8dcad7c3", size = 11053154, upload-time = "2025-05-29T13:31:00.865Z" }, - { url = "https://files.pythonhosted.org/packages/4e/eb/3eae144c5114e92deb65a0cb2c72326c8469e14991e9bc3ec0349da1331c/ruff-0.11.12-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9b6886b524a1c659cee1758140138455d3c029783d1b9e643f3624a5ee0cb0aa", size = 10403048, upload-time = "2025-05-29T13:31:03.413Z" }, - { url = "https://files.pythonhosted.org/packages/29/64/20c54b20e58b1058db6689e94731f2a22e9f7abab74e1a758dfba058b6ca/ruff-0.11.12-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc3a3690aad6e86c1958d3ec3c38c4594b6ecec75c1f531e84160bd827b2012", size = 10597062, upload-time = "2025-05-29T13:31:05.539Z" }, - { url = "https://files.pythonhosted.org/packages/29/3a/79fa6a9a39422a400564ca7233a689a151f1039110f0bbbabcb38106883a/ruff-0.11.12-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f97fdbc2549f456c65b3b0048560d44ddd540db1f27c778a938371424b49fe4a", size = 10155152, upload-time = "2025-05-29T13:31:07.986Z" }, - { url = "https://files.pythonhosted.org/packages/e5/a4/22c2c97b2340aa968af3a39bc38045e78d36abd4ed3fa2bde91c31e712e3/ruff-0.11.12-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74adf84960236961090e2d1348c1a67d940fd12e811a33fb3d107df61eef8fc7", size = 11723067, upload-time = "2025-05-29T13:31:10.57Z" }, - { url = "https://files.pythonhosted.org/packages/bc/cf/3e452fbd9597bcd8058856ecd42b22751749d07935793a1856d988154151/ruff-0.11.12-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b56697e5b8bcf1d61293ccfe63873aba08fdbcbbba839fc046ec5926bdb25a3a", size = 12460807, upload-time = "2025-05-29T13:31:12.88Z" }, - { url = "https://files.pythonhosted.org/packages/2f/ec/8f170381a15e1eb7d93cb4feef8d17334d5a1eb33fee273aee5d1f8241a3/ruff-0.11.12-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4d47afa45e7b0eaf5e5969c6b39cbd108be83910b5c74626247e366fd7a36a13", size = 12063261, upload-time = "2025-05-29T13:31:15.236Z" }, - { url = "https://files.pythonhosted.org/packages/0d/bf/57208f8c0a8153a14652a85f4116c0002148e83770d7a41f2e90b52d2b4e/ruff-0.11.12-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:692bf9603fe1bf949de8b09a2da896f05c01ed7a187f4a386cdba6760e7f61be", size = 11329601, upload-time = "2025-05-29T13:31:18.68Z" }, - { url = "https://files.pythonhosted.org/packages/c3/56/edf942f7fdac5888094d9ffa303f12096f1a93eb46570bcf5f14c0c70880/ruff-0.11.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08033320e979df3b20dba567c62f69c45e01df708b0f9c83912d7abd3e0801cd", size = 11522186, upload-time = "2025-05-29T13:31:21.216Z" }, - { url = "https://files.pythonhosted.org/packages/ed/63/79ffef65246911ed7e2290aeece48739d9603b3a35f9529fec0fc6c26400/ruff-0.11.12-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:929b7706584f5bfd61d67d5070f399057d07c70585fa8c4491d78ada452d3bef", size = 10449032, upload-time = "2025-05-29T13:31:23.417Z" }, - { url = "https://files.pythonhosted.org/packages/88/19/8c9d4d8a1c2a3f5a1ea45a64b42593d50e28b8e038f1aafd65d6b43647f3/ruff-0.11.12-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:7de4a73205dc5756b8e09ee3ed67c38312dce1aa28972b93150f5751199981b5", size = 10129370, upload-time = 
"2025-05-29T13:31:25.777Z" }, - { url = "https://files.pythonhosted.org/packages/bc/0f/2d15533eaa18f460530a857e1778900cd867ded67f16c85723569d54e410/ruff-0.11.12-py3-none-musllinux_1_2_i686.whl", hash = "sha256:2635c2a90ac1b8ca9e93b70af59dfd1dd2026a40e2d6eebaa3efb0465dd9cf02", size = 11123529, upload-time = "2025-05-29T13:31:28.396Z" }, - { url = "https://files.pythonhosted.org/packages/4f/e2/4c2ac669534bdded835356813f48ea33cfb3a947dc47f270038364587088/ruff-0.11.12-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d05d6a78a89166f03f03a198ecc9d18779076ad0eec476819467acb401028c0c", size = 11577642, upload-time = "2025-05-29T13:31:30.647Z" }, - { url = "https://files.pythonhosted.org/packages/a7/9b/c9ddf7f924d5617a1c94a93ba595f4b24cb5bc50e98b94433ab3f7ad27e5/ruff-0.11.12-py3-none-win32.whl", hash = "sha256:f5a07f49767c4be4772d161bfc049c1f242db0cfe1bd976e0f0886732a4765d6", size = 10475511, upload-time = "2025-05-29T13:31:32.917Z" }, - { url = "https://files.pythonhosted.org/packages/fd/d6/74fb6d3470c1aada019ffff33c0f9210af746cca0a4de19a1f10ce54968a/ruff-0.11.12-py3-none-win_amd64.whl", hash = "sha256:5a4d9f8030d8c3a45df201d7fb3ed38d0219bccd7955268e863ee4a115fa0832", size = 11523573, upload-time = "2025-05-29T13:31:35.782Z" }, - { url = "https://files.pythonhosted.org/packages/44/42/d58086ec20f52d2b0140752ae54b355ea2be2ed46f914231136dd1effcc7/ruff-0.11.12-py3-none-win_arm64.whl", hash = "sha256:65194e37853158d368e333ba282217941029a28ea90913c67e558c611d04daa5", size = 10697770, upload-time = "2025-05-29T13:31:38.009Z" }, +version = "0.14.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/55/cccfca45157a2031dcbb5a462a67f7cf27f8b37d4b3b1cd7438f0f5c1df6/ruff-0.14.4.tar.gz", hash = "sha256:f459a49fe1085a749f15414ca76f61595f1a2cc8778ed7c279b6ca2e1fd19df3", size = 5587844, upload-time = "2025-11-06T22:07:45.033Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/b9/67240254166ae1eaa38dec32265e9153ac53645a6c6670ed36ad00722af8/ruff-0.14.4-py3-none-linux_armv6l.whl", hash = "sha256:e6604613ffbcf2297cd5dcba0e0ac9bd0c11dc026442dfbb614504e87c349518", size = 12606781, upload-time = "2025-11-06T22:07:01.841Z" }, + { url = "https://files.pythonhosted.org/packages/46/c8/09b3ab245d8652eafe5256ab59718641429f68681ee713ff06c5c549f156/ruff-0.14.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d99c0b52b6f0598acede45ee78288e5e9b4409d1ce7f661f0fa36d4cbeadf9a4", size = 12946765, upload-time = "2025-11-06T22:07:05.858Z" }, + { url = "https://files.pythonhosted.org/packages/14/bb/1564b000219144bf5eed2359edc94c3590dd49d510751dad26202c18a17d/ruff-0.14.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9358d490ec030f1b51d048a7fd6ead418ed0826daf6149e95e30aa67c168af33", size = 11928120, upload-time = "2025-11-06T22:07:08.023Z" }, + { url = "https://files.pythonhosted.org/packages/a3/92/d5f1770e9988cc0742fefaa351e840d9aef04ec24ae1be36f333f96d5704/ruff-0.14.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81b40d27924f1f02dfa827b9c0712a13c0e4b108421665322218fc38caf615c2", size = 12370877, upload-time = "2025-11-06T22:07:10.015Z" }, + { url = "https://files.pythonhosted.org/packages/e2/29/e9282efa55f1973d109faf839a63235575519c8ad278cc87a182a366810e/ruff-0.14.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f5e649052a294fe00818650712083cddc6cc02744afaf37202c65df9ea52efa5", size = 12408538, upload-time = "2025-11-06T22:07:13.085Z" }, + { url = 
"https://files.pythonhosted.org/packages/8e/01/930ed6ecfce130144b32d77d8d69f5c610e6d23e6857927150adf5d7379a/ruff-0.14.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa082a8f878deeba955531f975881828fd6afd90dfa757c2b0808aadb437136e", size = 13141942, upload-time = "2025-11-06T22:07:15.386Z" }, + { url = "https://files.pythonhosted.org/packages/6a/46/a9c89b42b231a9f487233f17a89cbef9d5acd538d9488687a02ad288fa6b/ruff-0.14.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1043c6811c2419e39011890f14d0a30470f19d47d197c4858b2787dfa698f6c8", size = 14544306, upload-time = "2025-11-06T22:07:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/78/96/9c6cf86491f2a6d52758b830b89b78c2ae61e8ca66b86bf5a20af73d20e6/ruff-0.14.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a9f3a936ac27fb7c2a93e4f4b943a662775879ac579a433291a6f69428722649", size = 14210427, upload-time = "2025-11-06T22:07:19.832Z" }, + { url = "https://files.pythonhosted.org/packages/71/f4/0666fe7769a54f63e66404e8ff698de1dcde733e12e2fd1c9c6efb689cb5/ruff-0.14.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:95643ffd209ce78bc113266b88fba3d39e0461f0cbc8b55fb92505030fb4a850", size = 13658488, upload-time = "2025-11-06T22:07:22.32Z" }, + { url = "https://files.pythonhosted.org/packages/ee/79/6ad4dda2cfd55e41ac9ed6d73ef9ab9475b1eef69f3a85957210c74ba12c/ruff-0.14.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:456daa2fa1021bc86ca857f43fe29d5d8b3f0e55e9f90c58c317c1dcc2afc7b5", size = 13354908, upload-time = "2025-11-06T22:07:24.347Z" }, + { url = "https://files.pythonhosted.org/packages/b5/60/f0b6990f740bb15c1588601d19d21bcc1bd5de4330a07222041678a8e04f/ruff-0.14.4-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:f911bba769e4a9f51af6e70037bb72b70b45a16db5ce73e1f72aefe6f6d62132", size = 13587803, upload-time = "2025-11-06T22:07:26.327Z" }, + { url = "https://files.pythonhosted.org/packages/c9/da/eaaada586f80068728338e0ef7f29ab3e4a08a692f92eb901a4f06bbff24/ruff-0.14.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:76158a7369b3979fa878612c623a7e5430c18b2fd1c73b214945c2d06337db67", size = 12279654, upload-time = "2025-11-06T22:07:28.46Z" }, + { url = "https://files.pythonhosted.org/packages/66/d4/b1d0e82cf9bf8aed10a6d45be47b3f402730aa2c438164424783ac88c0ed/ruff-0.14.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f3b8f3b442d2b14c246e7aeca2e75915159e06a3540e2f4bed9f50d062d24469", size = 12357520, upload-time = "2025-11-06T22:07:31.468Z" }, + { url = "https://files.pythonhosted.org/packages/04/f4/53e2b42cc82804617e5c7950b7079d79996c27e99c4652131c6a1100657f/ruff-0.14.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c62da9a06779deecf4d17ed04939ae8b31b517643b26370c3be1d26f3ef7dbde", size = 12719431, upload-time = "2025-11-06T22:07:33.831Z" }, + { url = "https://files.pythonhosted.org/packages/a2/94/80e3d74ed9a72d64e94a7b7706b1c1ebaa315ef2076fd33581f6a1cd2f95/ruff-0.14.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a443a83a1506c684e98acb8cb55abaf3ef725078be40237463dae4463366349", size = 13464394, upload-time = "2025-11-06T22:07:35.905Z" }, + { url = "https://files.pythonhosted.org/packages/54/1a/a49f071f04c42345c793d22f6cf5e0920095e286119ee53a64a3a3004825/ruff-0.14.4-py3-none-win32.whl", hash = "sha256:643b69cb63cd996f1fc7229da726d07ac307eae442dd8974dbc7cf22c1e18fff", size = 12493429, upload-time = "2025-11-06T22:07:38.43Z" }, + { url = 
"https://files.pythonhosted.org/packages/bc/22/e58c43e641145a2b670328fb98bc384e20679b5774258b1e540207580266/ruff-0.14.4-py3-none-win_amd64.whl", hash = "sha256:26673da283b96fe35fa0c939bf8411abec47111644aa9f7cfbd3c573fb125d2c", size = 13635380, upload-time = "2025-11-06T22:07:40.496Z" }, + { url = "https://files.pythonhosted.org/packages/30/bd/4168a751ddbbf43e86544b4de8b5c3b7be8d7167a2a5cb977d274e04f0a1/ruff-0.14.4-py3-none-win_arm64.whl", hash = "sha256:dd09c292479596b0e6fec8cd95c65c3a6dc68e9ad17b8f2382130f87ff6a75bb", size = 12663065, upload-time = "2025-11-06T22:07:42.603Z" }, ] [[package]] From 051afaffbaec3134c28082bea2966289b7a96796 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 13 Nov 2025 05:36:29 +0000 Subject: [PATCH 08/11] feat: feat: wip implementation of function pod node --- src/orcapod/core/function_pod.py | 323 +++++++++++++++++++++++++++---- 1 file changed, 288 insertions(+), 35 deletions(-) diff --git a/src/orcapod/core/function_pod.py b/src/orcapod/core/function_pod.py index 7654353..31ed02f 100644 --- a/src/orcapod/core/function_pod.py +++ b/src/orcapod/core/function_pod.py @@ -81,11 +81,10 @@ def validate_inputs(self, *streams: Stream) -> None: Raises: PodInputValidationError: If inputs are invalid """ - if len(streams) != 1: - raise ValueError( - f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" - ) - input_stream = streams[0] + input_stream = self.handle_input_streams(*streams) + self._validate_input(input_stream) + + def _validate_input(self, input_stream: Stream) -> None: _, incoming_packet_types = input_stream.output_schema() expected_packet_schema = self.packet_function.input_packet_schema if not schema_utils.check_typespec_compatibility( @@ -109,6 +108,22 @@ def process_packet(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: """ return tag, self.packet_function.call(packet) + def handle_input_streams(self, *streams: Stream) -> Stream: + """ + Handle multiple input streams by joining them if necessary. + + Args: + *streams: Input streams to handle + """ + # handle multiple input streams + if len(streams) == 0: + raise ValueError("At least one input stream is required") + elif len(streams) > 1: + multi_stream_handler = self.multi_stream_handler() + joined_stream = multi_stream_handler.process(*streams) + return joined_stream + return streams[0] + def process( self, *streams: Stream, label: str | None = None ) -> "FunctionPodStream": @@ -124,17 +139,10 @@ def process( """ logger.debug(f"Invoking kernel {self} on streams: {streams}") - # handle multiple input streams - if len(streams) == 0: - raise ValueError("At least one input stream is required") - elif len(streams) > 1: - multi_stream_handler = self.multi_stream_handler() - joined_stream = multi_stream_handler.process(*streams) - streams = (joined_stream,) - input_stream = streams[0] + input_stream = self.handle_input_streams(*streams) # perform input stream validation - self.validate_inputs(*streams) + self._validate_input(input_stream) self.tracker_manager.record_packet_function_invocation( self.packet_function, input_stream, label=label ) @@ -459,7 +467,7 @@ def process(self, *streams: Stream, label: str | None = None) -> FunctionPodStre return self._function_pod.process(*streams, label=label) -class FunctionPodNode(FunctionPod): +class FunctionPodNode(OrcapodBase): """ A pod that caches the results of the wrapped pod. This is useful for pods that are expensive to compute and can benefit from caching. 
@@ -468,12 +476,16 @@ class FunctionPodNode(FunctionPod): def __init__( self, packet_function: PacketFunction, - input_streams: Collection[Stream], + input_stream: Stream, pipeline_database: ArrowDatabase, result_database: ArrowDatabase | None = None, pipeline_path_prefix: tuple[str, ...] = (), + tracker_manager: TrackerManager | None = None, **kwargs, ): + if tracker_manager is None: + tracker_manager = DEFAULT_TRACKER_MANAGER + self.tracker_manager = tracker_manager result_path_prefix = () if result_database is None: result_database = pipeline_database @@ -486,9 +498,10 @@ def __init__( record_path_prefix=result_path_prefix, ) - super().__init__(self._cached_packet_function, **kwargs) + # initialize the base FunctionPod with the cached packet function + super().__init__(**kwargs) - self._input_streams = input_streams + self._input_stream = input_stream self._pipeline_database = pipeline_database self._pipeline_path_prefix = pipeline_path_prefix @@ -496,14 +509,18 @@ def __init__( # take the pipeline node hash and schema hashes self._pipeline_node_hash = self.content_hash().to_string() + self._output_schema_hash = self.data_context.object_hasher.hash_object( + self._cached_packet_function.output_packet_schema + ).to_string() + # compute tag schema hash, inclusive of system tags tag_schema, _ = self.output_schema(columns={"system_tags": True}) self._tag_schema_hash = self.data_context.object_hasher.hash_object( tag_schema ).to_string() - def node_identity_structure(self) -> Any: - return (self.packet_function, self.argument_symmetry(self._input_streams)) + def identity_structure(self) -> Any: + return (self._cached_packet_function, self._input_stream) @property def pipeline_path(self) -> tuple[str, ...]: @@ -517,24 +534,11 @@ def uri(self) -> tuple[str, ...]: f"tag:{self._tag_schema_hash}", ) - def output_schema( - self, - *streams: Stream, - columns: ColumnConfig | dict[str, Any] | None = None, - all_info: bool = False, - ) -> tuple[PythonSchema, PythonSchema]: - return super().output_schema( - *self._input_streams, columns=columns, all_info=all_info - ) - - def process( - self, *streams: Stream, label: str | None = None - ) -> "FunctionPodStream": + def validate_inputs(self, *streams: Stream) -> None: if len(streams) > 0: raise ValueError( - "FunctionPodNode.process does not accept external streams; input streams are fixed at initialization." + "FunctionPodNode.validate_inputs does not accept external streams; input streams are fixed at initialization." ) - return super().process(*self._input_streams, label=label) def process_packet( self, @@ -575,6 +579,64 @@ def process_packet( return tag, output_packet + def process( + self, *streams: Stream, label: str | None = None + ) -> "FunctionPodNodeStream": + """ + Invoke the packet processor on the input stream. + If multiple streams are passed in, all streams are joined before processing. 
+ + Args: + *streams: Input streams to process + + Returns: + cp.Stream: The resulting output stream + """ + logger.debug(f"Invoking kernel {self} on streams: {streams}") + + # perform input stream validation + self.validate_inputs(self._input_stream) + self.tracker_manager.record_packet_function_invocation( + self._cached_packet_function, self._input_stream, label=label + ) + output_stream = FunctionPodNodeStream( + fp_node=self, + input_stream=self._input_stream, + ) + return output_stream + + def __call__( + self, *streams: Stream, label: str | None = None + ) -> "FunctionPodNodeStream": + """ + Convenience method to invoke the pod process on a collection of streams, + """ + logger.debug(f"Invoking pod {self} on streams through __call__: {streams}") + # perform input stream validation + return self.process(*streams, label=label) + + def argument_symmetry(self, streams: Collection[Stream]) -> ArgumentGroup: + if len(streams) > 0: + raise ValueError( + "FunctionPodNode.argument_symmetry does not accept external streams; input streams are fixed at initialization." + ) + return () + + def output_schema( + self, + *streams: Stream, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> tuple[PythonSchema, PythonSchema]: + # TODO: decide on how to handle extra inputs if provided + + tag_schema = self._input_stream.output_schema( + *streams, columns=columns, all_info=all_info + )[0] + # The output schema of the FunctionPod is determined by the packet function + # TODO: handle and extend to include additional columns + return tag_schema, self._cached_packet_function.output_packet_schema + def add_pipeline_record( self, tag: Tag, @@ -639,6 +701,197 @@ def add_pipeline_record( ) +class FunctionPodNodeStream(StreamBase): + """ + Recomputable stream wrapping a packet function. + """ + + def __init__( + self, fp_node: FunctionPodNode, input_stream: Stream, **kwargs + ) -> None: + super().__init__(**kwargs) + self._fp_node = fp_node + self._input_stream = input_stream + + # capture the iterator over the input stream + self._cached_input_iterator = input_stream.iter_packets() + self._update_modified_time() # update the modified time to AFTER we obtain the iterator + # note that the invocation of iter_packets on upstream likely triggeres the modified time + # to be updated on the usptream. Hence you want to set this stream's modified time after that. 
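# A minimal sketch of the ordering argument made in the comments above, with hypothetical
# stand-in classes rather than the orcapod Stream API: obtaining the upstream iterator bumps
# the upstream's timestamp, so the derived stream stamps its own modified time afterwards and
# therefore does not look stale immediately after construction.
from datetime import datetime, timezone


class ToyUpstream:
    def __init__(self) -> None:
        self.last_modified = datetime.now(timezone.utc)

    def iter_packets(self):
        # Mirrors the behaviour noted above: starting iteration updates the timestamp.
        self.last_modified = datetime.now(timezone.utc)
        return iter([])


class ToyDerivedStream:
    def __init__(self, upstream: ToyUpstream) -> None:
        self.upstream = upstream
        self._iterator = upstream.iter_packets()         # bumps upstream.last_modified
        self.last_modified = datetime.now(timezone.utc)  # stamped afterwards, never older

    def is_stale(self) -> bool:
        return self.upstream.last_modified > self.last_modified


# ToyDerivedStream(ToyUpstream()).is_stale() is False right after construction and only
# becomes True once the upstream is touched again later.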
+ + # Packet-level caching (for the output packets) + self._cached_output_packets: dict[int, tuple[Tag, Packet | None]] = {} + self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None + + def refresh_cache(self) -> None: + upstream_last_modified = self._input_stream.last_modified + if ( + upstream_last_modified is None + or self.last_modified is None + or upstream_last_modified > self.last_modified + ): + # input stream has been modified since last processing; refresh caches + # re-cache the iterator and clear out output packet cache + self._cached_input_iterator = self._input_stream.iter_packets() + self._cached_output_packets.clear() + self._cached_output_table = None + self._cached_content_hash_column = None + self._update_modified_time() + + @property + def source(self) -> FunctionPodNode: + return self._fp_node + + @property + def upstreams(self) -> tuple[Stream, ...]: + return (self._input_stream,) + + def keys( + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + tag_schema, packet_schema = self.output_schema( + columns=columns, all_info=all_info + ) + + return tuple(tag_schema.keys()), tuple(packet_schema.keys()) + + def output_schema( + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> tuple[PythonSchema, PythonSchema]: + tag_schema = self._input_stream.output_schema( + columns=columns, all_info=all_info + )[0] + packet_schema = self._fp_node._cached_packet_function.output_packet_schema + return (tag_schema, packet_schema) + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + return self.iter_packets() + + def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: + if self._cached_input_iterator is not None: + for i, (tag, packet) in enumerate(self._cached_input_iterator): + if i in self._cached_output_packets: + # Use cached result + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + else: + # Process packet + tag, output_packet = self._fp_node.process_packet(tag, packet) + self._cached_output_packets[i] = (tag, output_packet) + if output_packet is not None: + # Update shared cache for future iterators (optimization) + yield tag, output_packet + + # Mark completion by releasing the iterator + self._cached_input_iterator = None + else: + # Yield from snapshot of complete cache + for i in range(len(self._cached_output_packets)): + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + + def as_table( + self, + *, + columns: ColumnConfig | dict[str, Any] | None = None, + all_info: bool = False, + ) -> "pa.Table": + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + tag_schema, packet_schema = None, None + for tag, packet in self.iter_packets(): + if tag_schema is None: + tag_schema = tag.arrow_schema(all_info=True) + if packet_schema is None: + packet_schema = packet.arrow_schema(all_info=True) + # TODO: make use of arrow_compat dict + all_tags.append(tag.as_dict(all_info=True)) + all_packets.append(packet.as_dict(all_info=True)) + + # TODO: re-verify the implemetation of this conversion + converter = self.data_context.type_converter + + struct_packets = converter.python_dicts_to_struct_dicts(all_packets) + all_tags_as_tables: pa.Table = pa.Table.from_pylist( + all_tags, schema=tag_schema + ) + # drop context key column from tags table + all_tags_as_tables = 
all_tags_as_tables.drop([constants.CONTEXT_KEY]) + all_packets_as_tables: pa.Table = pa.Table.from_pylist( + struct_packets, schema=packet_schema + ) + + self._cached_output_table = arrow_utils.hstack_tables( + all_tags_as_tables, all_packets_as_tables + ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." + ) + + column_config = ColumnConfig.handle_config(columns, all_info=all_info) + + drop_columns = [] + if not column_config.system_tags: + # TODO: get system tags more effiicently + drop_columns.extend( + [ + c + for c in self._cached_output_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) + if not column_config.source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not column_config.context: + drop_columns.append(constants.CONTEXT_KEY) + + output_table = self._cached_output_table.drop(drop_columns) + + # lazily prepare content hash column if requested + if column_config.content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + # TODO: verify that order will be preserved + for tag, packet in self.iter_packets(): + content_hashes.append(packet.content_hash().to_string()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." + ) + hash_column_name = ( + "_content_hash" + if column_config.content_hash is True + else column_config.content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + + if column_config.sort_by_tags: + # TODO: reimplement using polars natively + output_table = ( + pl.DataFrame(output_table) + .sort(by=self.keys()[0], descending=False) + .to_arrow() + ) + # output_table = output_table.sort_by( + # [(column, "ascending") for column in self.keys()[0]] + # ) + return output_table + + # class CachedFunctionPod(WrappedFunctionPod): # """ # A pod that caches the results of the wrapped pod. From 9366d894cda128a4fdf923b87684376c713aab4b Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 2 Dec 2025 06:56:45 +0000 Subject: [PATCH 09/11] refactor: clean up inheritance hierarchy --- src/orcapod/core/__init__.py | 4 +- src/orcapod/core/base.py | 40 +- src/orcapod/core/datagrams/arrow_datagram.py | 11 +- .../core/datagrams/arrow_tag_packet.py | 10 +- src/orcapod/core/datagrams/base.py | 21 +- src/orcapod/core/datagrams/dict_datagram.py | 10 +- src/orcapod/core/datagrams/dict_tag_packet.py | 12 +- src/orcapod/core/execution_engine.py | 2 + src/orcapod/core/function_pod.py | 13 +- .../{streams => legacy}/cached_pod_stream.py | 0 .../{streams => legacy}/lazy_pod_stream.py | 10 +- src/orcapod/core/legacy/pod_node_stream.py | 424 ++++++++++++++++++ src/orcapod/core/{ => legacy}/pods.py | 0 src/orcapod/core/operators/base.py | 14 +- src/orcapod/core/operators/batch.py | 2 +- .../core/operators/column_selection.py | 10 +- src/orcapod/core/operators/filters.py | 6 +- src/orcapod/core/operators/join.py | 2 +- src/orcapod/core/operators/mappers.py | 4 +- src/orcapod/core/operators/semijoin.py | 4 +- src/orcapod/core/packet_function.py | 60 ++- ...executable_pod.py => static_output_pod.py} | 29 +- src/orcapod/core/streams/__init__.py | 18 +- src/orcapod/core/streams/base.py | 14 +- src/orcapod/core/streams/pod_node_stream.py | 424 ------------------ src/orcapod/core/streams/table_stream.py | 5 +- src/orcapod/core/tracker.py | 19 +- .../basic_delta_lake_arrow_database.py | 3 +- .../protocols/core_protocols/datagrams.py | 23 +- .../protocols/core_protocols/function_pod.py | 2 - .../core_protocols/orcapod_object.py | 11 + .../core_protocols/packet_function.py | 2 +- src/orcapod/protocols/core_protocols/pod.py | 8 +- .../protocols/core_protocols/streams.py | 6 +- .../{core => utils}/arrow_data_utils.py | 0 .../{core => utils}/polars_data_utils.py | 0 36 files changed, 631 insertions(+), 592 deletions(-) rename src/orcapod/core/{streams => legacy}/cached_pod_stream.py (100%) rename src/orcapod/core/{streams => legacy}/lazy_pod_stream.py (100%) create mode 100644 src/orcapod/core/legacy/pod_node_stream.py rename src/orcapod/core/{ => legacy}/pods.py (100%) rename src/orcapod/core/{executable_pod.py => static_output_pod.py} (91%) delete mode 100644 src/orcapod/core/streams/pod_node_stream.py create mode 100644 src/orcapod/protocols/core_protocols/orcapod_object.py rename src/orcapod/{core => utils}/arrow_data_utils.py (100%) rename src/orcapod/{core => utils}/polars_data_utils.py (100%) diff --git a/src/orcapod/core/__init__.py b/src/orcapod/core/__init__.py index f483ca0..724c67c 100644 --- a/src/orcapod/core/__init__.py +++ b/src/orcapod/core/__init__.py @@ -1,7 +1,5 @@ -from .tracker import DEFAULT_TRACKER_MANAGER -from ..system_constants import constants +from orcapod.core.tracker import DEFAULT_TRACKER_MANAGER __all__ = [ "DEFAULT_TRACKER_MANAGER", - "constants", ] diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index cb8d8f5..5f05835 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging from abc import ABC, abstractmethod from datetime import datetime, timezone @@ -14,6 +16,11 @@ class LabelableMixin: + """ + Mixin class for objects that can have a label. Provides a mechanism to compute a label based on the object's content. + By default, explicitly set label will always take precedence over computed label and inferred label. 
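# A minimal sketch of the label precedence described in the docstring above (explicit label,
# then content-derived label, then a generic fallback). `ToyLabelable` and its class-name
# fallback are assumptions for illustration; the full fallback logic of LabelableMixin is not
# shown in this hunk.
class ToyLabelable:
    def __init__(self, label: str | None = None) -> None:
        self._label = label  # explicitly assigned label, if any

    @property
    def has_assigned_label(self) -> bool:
        return self._label is not None

    @property
    def computed_label(self) -> str | None:
        # Subclasses would derive this from their content; no computed label by default.
        return None

    @property
    def label(self) -> str:
        # Explicit label wins, then the computed label, then a generic class-name fallback.
        if self._label is not None:
            return self._label
        return self.computed_label or self.__class__.__name__


# ToyLabelable("my_node").label -> "my_node"; ToyLabelable().label -> "ToyLabelable"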
+ """ + def __init__(self, label: str | None = None, **kwargs): self._label = label super().__init__(**kwargs) @@ -31,7 +38,7 @@ def label(self) -> str: @property def has_assigned_label(self) -> bool: """ - Check if the label is explicitly set for this object. + Check if the label has been explicitly set for this object. Returns: bool: True if the label is explicitly set, False otherwise. @@ -57,6 +64,11 @@ def computed_label(self) -> str | None: class DataContextMixin: + """ + Mixin to associate data context and an Orcapod config with an object. Deriving class allows data context and Orcapod config to be + explicitly specified and if not provided, use the default data context and Orcapod config. + """ + def __init__( self, data_context: str | contexts.DataContext | None = None, @@ -64,10 +76,12 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + self._data_context = contexts.resolve_context(data_context) if orcapod_config is None: - orcapod_config = DEFAULT_CONFIG + orcapod_config = ( + DEFAULT_CONFIG # DEFAULT_CONFIG as defined in orcapod/config.py + ) self._orcapod_config = orcapod_config - self._data_context = contexts.resolve_context(data_context) @property def orcapod_config(self) -> Config: @@ -77,6 +91,7 @@ def orcapod_config(self) -> Config: def data_context(self) -> contexts.DataContext: return self._data_context + # TODO: re-evaluate whether changing data context should be allowed @data_context.setter def data_context(self, context: str | contexts.DataContext | None) -> None: self._data_context = contexts.resolve_context(context) @@ -98,14 +113,18 @@ class ContentIdentifiableBase(DataContextMixin, ABC): Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. """ - def __init__(self, **kwargs) -> None: + def __init__( + self, + data_context: str | contexts.DataContext | None = None, + orcapod_config: Config | None = None, + ) -> None: """ Initialize the ContentHashable with an optional ObjectHasher. Args: identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. """ - super().__init__(**kwargs) + super().__init__(data_context=data_context, orcapod_config=orcapod_config) self._cached_content_hash: hp.ContentHash | None = None self._cached_int_hash: int | None = None @@ -225,6 +244,17 @@ class OrcapodBase(TemporalMixin, LabelableMixin, ContentIdentifiableBase): and modification timestamp. 
""" + def __init__( + self, + label: str | None = None, + data_context: str | contexts.DataContext | None = None, + orcapod_config: Config | None = None, + ): + # Init provided here for explicit listing of parmeters + super().__init__( + label=label, data_context=data_context, orcapod_config=orcapod_config + ) + def __repr__(self): return self.__class__.__name__ diff --git a/src/orcapod/core/datagrams/arrow_datagram.py b/src/orcapod/core/datagrams/arrow_datagram.py index 428c212..4ff1d43 100644 --- a/src/orcapod/core/datagrams/arrow_datagram.py +++ b/src/orcapod/core/datagrams/arrow_datagram.py @@ -4,9 +4,9 @@ from orcapod import contexts from orcapod.core.datagrams.base import BaseDatagram -from orcapod.system_constants import constants from orcapod.protocols.core_protocols import ColumnConfig from orcapod.protocols.hashing_protocols import ContentHash +from orcapod.system_constants import constants from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -95,12 +95,13 @@ def __init__( ) # Extract context table from passed in table if present + # TODO: revisit the logic here if constants.CONTEXT_KEY in table.column_names and data_context is None: context_table = table.select([constants.CONTEXT_KEY]) data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] # Initialize base class with data context - super().__init__(data_context=data_context, record_id=record_id, **kwargs) + super().__init__(data_context=data_context, datagram_id=record_id, **kwargs) meta_columns = [ col for col in table.column_names if col.startswith(constants.META_PREFIX) @@ -777,9 +778,11 @@ def with_context_key(self, new_context_key: str) -> Self: return new_datagram # 8. Utility Operations - def copy(self, include_cache: bool = True) -> Self: + def copy(self, include_cache: bool = True, preserve_id: bool = True) -> Self: """Return a copy of the datagram.""" - new_datagram = super().copy() + new_datagram = super().copy( + include_cache=include_cache, preserve_id=preserve_id + ) new_datagram._data_table = self._data_table new_datagram._meta_table = self._meta_table diff --git a/src/orcapod/core/datagrams/arrow_tag_packet.py b/src/orcapod/core/datagrams/arrow_tag_packet.py index e64978d..d58feae 100644 --- a/src/orcapod/core/datagrams/arrow_tag_packet.py +++ b/src/orcapod/core/datagrams/arrow_tag_packet.py @@ -4,9 +4,9 @@ from orcapod import contexts from orcapod.core.datagrams.arrow_datagram import ArrowDatagram -from orcapod.system_constants import constants from orcapod.protocols.core_protocols import ColumnConfig from orcapod.semantic_types import infer_python_schema_from_pylist_data +from orcapod.system_constants import constants from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -199,9 +199,9 @@ def system_tags(self) -> dict[str, DataValue | None]: return self._system_tags_dict.copy() # 8. Utility Operations - def copy(self, include_cache: bool = True) -> Self: + def copy(self, include_cache: bool = True, preserve_id: bool = True) -> Self: """Return a copy of the datagram.""" - new_tag = super().copy(include_cache=include_cache) + new_tag = super().copy(include_cache=include_cache, preserve_id=preserve_id) new_tag._system_tags_dict = self._system_tags_dict.copy() new_tag._system_tags_python_schema = self._system_tags_python_schema.copy() @@ -521,9 +521,9 @@ def with_columns( return new_packet # 8. 
Utility Operations - def copy(self, include_cache: bool = True) -> Self: + def copy(self, include_cache: bool = True, preserve_id: bool = True) -> Self: """Return a copy of the datagram.""" - new_packet = super().copy(include_cache=include_cache) + new_packet = super().copy(include_cache=include_cache, preserve_id=preserve_id) new_packet._source_info_table = self._source_info_table if include_cache: diff --git a/src/orcapod/core/datagrams/base.py b/src/orcapod/core/datagrams/base.py index 4a35732..5a291c1 100644 --- a/src/orcapod/core/datagrams/base.py +++ b/src/orcapod/core/datagrams/base.py @@ -20,7 +20,6 @@ from abc import abstractmethod from collections.abc import Collection, Iterator, Mapping from typing import TYPE_CHECKING, Any, Self, TypeAlias -from uuid import UUID from uuid_utils import uuid7 @@ -121,18 +120,18 @@ class BaseDatagram(ContentIdentifiableBase): is interpreted and used is left to concrete implementations. """ - def __init__(self, record_id: str | None = None, **kwargs): + def __init__(self, datagram_id: str | None = None, **kwargs): super().__init__(**kwargs) - self._record_id = record_id + self._datagram_id = datagram_id @property - def record_id(self) -> str: + def datagram_id(self) -> str: """ Returns record ID """ - if self._record_id is None: - self._record_id = str(uuid7()) - return self._record_id + if self._datagram_id is None: + self._datagram_id = str(uuid7()) + return self._datagram_id # TODO: revisit handling of identity structure for datagrams def identity_structure(self) -> Any: @@ -283,13 +282,13 @@ def with_context_key(self, new_context_key: str) -> Self: return new_datagram # 8. Utility Operations - def copy(self, include_cache: bool = True, preserve_record_id: bool = True) -> Self: + def copy(self, include_cache: bool = True, preserve_id: bool = True) -> Self: """Create a shallow copy of the datagram.""" new_datagram = object.__new__(self.__class__) new_datagram._data_context = self._data_context - if preserve_record_id: - new_datagram._record_id = self._record_id + if preserve_id: + new_datagram._datagram_id = self._datagram_id else: - new_datagram._record_id = None + new_datagram._datagram_id = None return new_datagram diff --git a/src/orcapod/core/datagrams/dict_datagram.py b/src/orcapod/core/datagrams/dict_datagram.py index e7e4b60..2e835a4 100644 --- a/src/orcapod/core/datagrams/dict_datagram.py +++ b/src/orcapod/core/datagrams/dict_datagram.py @@ -4,10 +4,10 @@ from orcapod import contexts from orcapod.core.datagrams.base import BaseDatagram -from orcapod.system_constants import constants from orcapod.protocols.core_protocols import ColumnConfig from orcapod.protocols.hashing_protocols import ContentHash from orcapod.semantic_types import infer_python_schema_from_pylist_data +from orcapod.system_constants import constants from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -99,7 +99,7 @@ def __init__( # Initialize base class with data context final_context = data_context or cast(str, extracted_context) - super().__init__(data_context=final_context, record_id=record_id, **kwargs) + super().__init__(data_context=final_context, datagram_id=record_id, **kwargs) # Store data and meta components separately (immutable) self._data = dict(data_columns) @@ -542,7 +542,7 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: ) # TODO: use copy instead - new_datagram._record_id = self._record_id + new_datagram._datagram_id = 
self._datagram_id return new_datagram @@ -771,7 +771,7 @@ def with_columns( return new_datagram # 8. Utility Operations - def copy(self, include_cache: bool = True, preserve_record_id: bool = True) -> Self: + def copy(self, include_cache: bool = True, preserve_id: bool = True) -> Self: """ Create a shallow copy of the datagram. @@ -783,7 +783,7 @@ def copy(self, include_cache: bool = True, preserve_record_id: bool = True) -> S New DictDatagram instance with copied data and caches. """ new_datagram = super().copy( - include_cache=include_cache, preserve_record_id=preserve_record_id + include_cache=include_cache, preserve_id=preserve_id ) new_datagram._data = self._data.copy() new_datagram._meta_data = self._meta_data.copy() diff --git a/src/orcapod/core/datagrams/dict_tag_packet.py b/src/orcapod/core/datagrams/dict_tag_packet.py index a53d9bf..ac49841 100644 --- a/src/orcapod/core/datagrams/dict_tag_packet.py +++ b/src/orcapod/core/datagrams/dict_tag_packet.py @@ -4,9 +4,9 @@ from orcapod import contexts from orcapod.core.datagrams.dict_datagram import DictDatagram -from orcapod.system_constants import constants from orcapod.protocols.core_protocols import ColumnConfig from orcapod.semantic_types import infer_python_schema_from_pylist_data +from orcapod.system_constants import constants from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -209,18 +209,20 @@ def system_tags(self) -> dict[str, DataValue]: """ return dict(self._system_tags) - def copy(self, include_cache: bool = True) -> Self: + def copy(self, include_cache: bool = True, preserve_id: bool = False) -> Self: """Return a shallow copy of the packet.""" instance = super().copy(include_cache=include_cache) instance._system_tags = self._system_tags.copy() if include_cache: instance._cached_system_tags_table = self._cached_system_tags_table instance._cached_system_tags_schema = self._cached_system_tags_schema - else: instance._cached_system_tags_table = None instance._cached_system_tags_schema = None + if preserve_id: + instance._datagram_id = self._datagram_id + return instance @@ -485,9 +487,9 @@ def with_source_info(self, **source_info: str | None) -> Self: return new_packet - def copy(self, include_cache: bool = True) -> Self: + def copy(self, include_cache: bool = True, preserve_id: bool = True) -> Self: """Return a shallow copy of the packet.""" - instance = super().copy(include_cache=include_cache) + instance = super().copy(include_cache=include_cache, preserve_id=preserve_id) instance._source_info = self._source_info.copy() if include_cache: instance._cached_source_info_table = self._cached_source_info_table diff --git a/src/orcapod/core/execution_engine.py b/src/orcapod/core/execution_engine.py index 98a242c..7fa21ea 100644 --- a/src/orcapod/core/execution_engine.py +++ b/src/orcapod/core/execution_engine.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from collections.abc import Callable from typing import Any, Protocol, runtime_checkable diff --git a/src/orcapod/core/function_pod.py b/src/orcapod/core/function_pod.py index 31ed02f..9f90947 100644 --- a/src/orcapod/core/function_pod.py +++ b/src/orcapod/core/function_pod.py @@ -1,12 +1,13 @@ +from __future__ import annotations + import logging from collections.abc import Callable, Collection, Iterator from typing import TYPE_CHECKING, Any, Protocol, cast -from orcapod.protocols.database_protocols import ArrowDatabase -from orcapod.system_constants import 
constants + from orcapod import contexts from orcapod.core.base import OrcapodBase from orcapod.core.operators import Join -from orcapod.core.packet_function import PythonPacketFunction, CachedPacketFunction +from orcapod.core.packet_function import CachedPacketFunction, PythonPacketFunction from orcapod.core.streams.base import StreamBase from orcapod.core.tracker import DEFAULT_TRACKER_MANAGER from orcapod.protocols.core_protocols import ( @@ -19,6 +20,8 @@ Tag, TrackerManager, ) +from orcapod.protocols.database_protocols import ArrowDatabase +from orcapod.system_constants import constants from orcapod.types import PythonSchema from orcapod.utils import arrow_utils, schema_utils from orcapod.utils.lazy_module import LazyModule @@ -26,8 +29,8 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: - import pyarrow as pa import polars as pl + import pyarrow as pa else: pa = LazyModule("pyarrow") pl = LazyModule("polars") @@ -573,7 +576,7 @@ def process_packet( self.add_pipeline_record( tag, packet, - packet_record_id=output_packet.record_id, + packet_record_id=output_packet.datagram_id, computed=result_computed, ) diff --git a/src/orcapod/core/streams/cached_pod_stream.py b/src/orcapod/core/legacy/cached_pod_stream.py similarity index 100% rename from src/orcapod/core/streams/cached_pod_stream.py rename to src/orcapod/core/legacy/cached_pod_stream.py diff --git a/src/orcapod/core/streams/lazy_pod_stream.py b/src/orcapod/core/legacy/lazy_pod_stream.py similarity index 100% rename from src/orcapod/core/streams/lazy_pod_stream.py rename to src/orcapod/core/legacy/lazy_pod_stream.py index aab5b65..5416976 100644 --- a/src/orcapod/core/streams/lazy_pod_stream.py +++ b/src/orcapod/core/legacy/lazy_pod_stream.py @@ -3,18 +3,18 @@ from pathlib import Path from typing import TYPE_CHECKING, Any -from orcapod.system_constants import constants +from orcapod.core.streams.base import StreamBase from orcapod.protocols import core_protocols as cp +from orcapod.system_constants import constants from orcapod.types import PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.core.streams.base import StreamBase - if TYPE_CHECKING: - import pyarrow as pa - import polars as pl import asyncio + + import polars as pl + import pyarrow as pa else: pa = LazyModule("pyarrow") pl = LazyModule("polars") diff --git a/src/orcapod/core/legacy/pod_node_stream.py b/src/orcapod/core/legacy/pod_node_stream.py new file mode 100644 index 0000000..5d2c7b5 --- /dev/null +++ b/src/orcapod/core/legacy/pod_node_stream.py @@ -0,0 +1,424 @@ +# import logging +# from collections.abc import Iterator +# from typing import TYPE_CHECKING, Any + +# import orcapod.protocols.core_protocols.execution_engine +# from orcapod.contexts.system_constants import constants +# from orcapod.core.streams.base import StreamBase +# from orcapod.core.streams.table_stream import TableStream +# from orcapod.protocols import core_protocols as cp +# from orcapod.protocols import pipeline_protocols as pp +# from orcapod.types import PythonSchema +# from orcapod.utils import arrow_utils +# from orcapod.utils.lazy_module import LazyModule + +# if TYPE_CHECKING: +# import polars as pl +# import pyarrow as pa +# import pyarrow.compute as pc + +# else: +# pa = LazyModule("pyarrow") +# pc = LazyModule("pyarrow.compute") +# pl = LazyModule("polars") + + +# # TODO: consider using this instead of making copy of dicts +# # from types import MappingProxyType + +# logger = logging.getLogger(__name__) + + +# class 
PodNodeStream(StreamBase): +# """ +# A fixed stream that is both cached pod and pipeline storage aware +# """ + +# # TODO: define interface for storage or pod storage +# def __init__(self, pod_node: pp.PodNode, input_stream: cp.Stream, **kwargs): +# super().__init__(source=pod_node, upstreams=(input_stream,), **kwargs) +# self.pod_node = pod_node +# self.input_stream = input_stream + +# # capture the immutable iterator from the input stream +# self._prepared_stream_iterator = input_stream.iter_packets() +# self._set_modified_time() # set modified time to when we obtain the iterator + +# # Packet-level caching (from your PodStream) +# self._cached_output_packets: list[tuple[cp.Tag, cp.Packet | None]] | None = None +# self._cached_output_table: pa.Table | None = None +# self._cached_content_hash_column: pa.Array | None = None + +# def set_mode(self, mode: str) -> None: +# return self.pod_node.set_mode(mode) + +# @property +# def mode(self) -> str: +# return self.pod_node.mode + +# async def run_async( +# self, +# *args: Any, +# execution_engine_opts: dict[str, Any] | None = None, +# **kwargs: Any, +# ) -> None: +# """ +# Runs the stream, processing the input stream and preparing the output stream. +# This is typically called before iterating over the packets. +# """ +# if self._cached_output_packets is None: +# cached_results, missing = self._identify_existing_and_missing_entries( +# *args, +# execution_engine=execution_engine, +# execution_engine_opts=execution_engine_opts, +# **kwargs, +# ) + +# tag_keys = self.input_stream.keys()[0] + +# pending_calls = [] +# if missing is not None and missing.num_rows > 0: +# for tag, packet in TableStream(missing, tag_columns=tag_keys): +# # Since these packets are known to be missing, skip the cache lookup +# pending = self.pod_node.async_call( +# tag, +# packet, +# skip_cache_lookup=True, +# execution_engine=execution_engine or self.execution_engine, +# execution_engine_opts=execution_engine_opts +# or self._execution_engine_opts, +# ) +# pending_calls.append(pending) + +# import asyncio + +# completed_calls = await asyncio.gather(*pending_calls) +# for result in completed_calls: +# cached_results.append(result) + +# self.clear_cache() +# self._cached_output_packets = cached_results +# self._set_modified_time() +# self.pod_node.flush() + +# def _identify_existing_and_missing_entries( +# self, +# *args: Any, +# execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine +# | None = None, +# execution_engine_opts: dict[str, Any] | None = None, +# **kwargs: Any, +# ) -> tuple[list[tuple[cp.Tag, cp.Packet | None]], pa.Table | None]: +# cached_results: list[tuple[cp.Tag, cp.Packet | None]] = [] + +# # identify all entries in the input stream for which we still have not computed packets +# if len(args) > 0 or len(kwargs) > 0: +# input_stream_used = self.input_stream.polars_filter(*args, **kwargs) +# else: +# input_stream_used = self.input_stream + +# target_entries = input_stream_used.as_table( +# include_system_tags=True, +# include_source=True, +# include_content_hash=constants.INPUT_PACKET_HASH, +# execution_engine=execution_engine or self.execution_engine, +# execution_engine_opts=execution_engine_opts or self._execution_engine_opts, +# ) +# existing_entries = self.pod_node.get_all_cached_outputs( +# include_system_columns=True +# ) +# if ( +# existing_entries is None +# or existing_entries.num_rows == 0 +# or self.mode == "development" +# ): +# missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) +# existing = 
None +# else: +# # TODO: do more proper replacement operation +# target_df = pl.DataFrame(target_entries) +# existing_df = pl.DataFrame( +# existing_entries.append_column( +# "_exists", pa.array([True] * len(existing_entries)) +# ) +# ) +# all_results_df = target_df.join( +# existing_df, +# on=constants.INPUT_PACKET_HASH, +# how="left", +# suffix="_right", +# ) +# all_results = all_results_df.to_arrow() + +# missing = ( +# all_results.filter(pc.is_null(pc.field("_exists"))) +# .select(target_entries.column_names) +# .drop_columns([constants.INPUT_PACKET_HASH]) +# ) + +# existing = all_results.filter( +# pc.is_valid(pc.field("_exists")) +# ).drop_columns( +# [ +# "_exists", +# constants.INPUT_PACKET_HASH, +# constants.PACKET_RECORD_ID, +# *self.input_stream.keys()[1], # remove the input packet keys +# ] +# # TODO: look into NOT fetching back the record ID +# ) +# renamed = [ +# c.removesuffix("_right") if c.endswith("_right") else c +# for c in existing.column_names +# ] +# existing = existing.rename_columns(renamed) + +# tag_keys = self.input_stream.keys()[0] + +# if existing is not None and existing.num_rows > 0: +# # If there are existing entries, we can cache them +# # TODO: cache them based on the record ID +# existing_stream = TableStream(existing, tag_columns=tag_keys) +# for tag, packet in existing_stream.iter_packets(): +# cached_results.append((tag, packet)) + +# return cached_results, missing + +# def run( +# self, +# *args: Any, +# execution_engine: cp.ExecutionEngine | None = None, +# execution_engine_opts: dict[str, Any] | None = None, +# **kwargs: Any, +# ) -> None: +# tag_keys = self.input_stream.keys()[0] +# cached_results, missing = self._identify_existing_and_missing_entries( +# *args, +# execution_engine=execution_engine, +# execution_engine_opts=execution_engine_opts, +# **kwargs, +# ) + +# if missing is not None and missing.num_rows > 0: +# packet_record_to_output_lut: dict[str, cp.Packet | None] = {} +# execution_engine_hash = ( +# execution_engine.name if execution_engine is not None else "default" +# ) +# for tag, packet in TableStream(missing, tag_columns=tag_keys): +# # compute record id +# packet_record_id = self.pod_node.get_record_id( +# packet, execution_engine_hash=execution_engine_hash +# ) + +# # Since these packets are known to be missing, skip the cache lookup +# if packet_record_id in packet_record_to_output_lut: +# output_packet = packet_record_to_output_lut[packet_record_id] +# else: +# tag, output_packet = self.pod_node.call( +# tag, +# packet, +# record_id=packet_record_id, +# skip_cache_lookup=True, +# execution_engine=execution_engine or self.execution_engine, +# execution_engine_opts=execution_engine_opts +# or self._execution_engine_opts, +# ) +# packet_record_to_output_lut[packet_record_id] = output_packet +# self.pod_node.add_pipeline_record( +# tag, +# packet, +# packet_record_id, +# retrieved=False, +# skip_cache_lookup=True, +# ) +# cached_results.append((tag, output_packet)) + +# # reset the cache and set new results +# self.clear_cache() +# self._cached_output_packets = cached_results +# self._set_modified_time() +# self.pod_node.flush() +# # TODO: evaluate proper handling of cache here +# # self.clear_cache() + +# def clear_cache(self) -> None: +# self._cached_output_packets = None +# self._cached_output_table = None +# self._cached_content_hash_column = None + +# def iter_packets( +# self, +# execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine, +# execution_engine_opts: dict[str, Any] | None = None, +# ) 
-> Iterator[tuple[cp.Tag, cp.Packet]]: +# """ +# Processes the input stream and prepares the output stream. +# This is typically called before iterating over the packets. +# """ + +# # if results are cached, simply return from them +# if self._cached_output_packets is not None: +# for tag, packet in self._cached_output_packets: +# if packet is not None: +# # make sure to skip over an empty packet +# yield tag, packet +# else: +# cached_results = [] +# # prepare the cache by loading from the record +# total_table = self.pod_node.get_all_records(include_system_columns=True) +# if total_table is None: +# return # empty out +# tag_types, packet_types = self.pod_node.output_types() + +# for tag, packet in TableStream(total_table, tag_columns=tag_types.keys()): +# cached_results.append((tag, packet)) +# yield tag, packet + +# # come up with a better caching mechanism +# self._cached_output_packets = cached_results +# self._set_modified_time() + +# def keys( +# self, include_system_tags: bool = False +# ) -> tuple[tuple[str, ...], tuple[str, ...]]: +# """ +# Returns the keys of the tag and packet columns in the stream. +# This is useful for accessing the columns in the stream. +# """ + +# tag_keys, _ = self.input_stream.keys(include_system_tags=include_system_tags) +# packet_keys = tuple(self.pod_node.output_packet_types().keys()) +# return tag_keys, packet_keys + +# def types( +# self, include_system_tags: bool = False +# ) -> tuple[PythonSchema, PythonSchema]: +# tag_typespec, _ = self.input_stream.types( +# include_system_tags=include_system_tags +# ) +# # TODO: check if copying can be avoided +# packet_typespec = dict(self.pod_node.output_packet_types()) +# return tag_typespec, packet_typespec + +# def as_table( +# self, +# include_data_context: bool = False, +# include_source: bool = False, +# include_system_tags: bool = False, +# include_content_hash: bool | str = False, +# sort_by_tags: bool = True, +# execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine +# | None = None, +# execution_engine_opts: dict[str, Any] | None = None, +# ) -> "pa.Table": +# if self._cached_output_table is None: +# all_tags = [] +# all_packets = [] +# tag_schema, packet_schema = None, None +# for tag, packet in self.iter_packets( +# execution_engine=execution_engine or self.execution_engine, +# execution_engine_opts=execution_engine_opts +# or self._execution_engine_opts, +# ): +# if tag_schema is None: +# tag_schema = tag.arrow_schema(include_system_tags=True) +# if packet_schema is None: +# packet_schema = packet.arrow_schema( +# include_context=True, +# include_source=True, +# ) +# all_tags.append(tag.as_dict(include_system_tags=True)) +# # FIXME: using in the pinch conversion to str from path +# # replace with an appropriate semantic converter-based approach! 
+# dict_patcket = packet.as_dict(include_context=True, include_source=True) +# all_packets.append(dict_patcket) + +# converter = self.data_context.type_converter + +# if len(all_tags) == 0: +# tag_types, packet_types = self.pod_node.output_types( +# include_system_tags=True +# ) +# tag_schema = converter.python_schema_to_arrow_schema(tag_types) +# source_entries = { +# f"{constants.SOURCE_PREFIX}{c}": str for c in packet_types.keys() +# } +# packet_types.update(source_entries) +# packet_types[constants.CONTEXT_KEY] = str +# packet_schema = converter.python_schema_to_arrow_schema(packet_types) +# total_schema = arrow_utils.join_arrow_schemas(tag_schema, packet_schema) +# # return an empty table with the right schema +# self._cached_output_table = pa.Table.from_pylist( +# [], schema=total_schema +# ) +# else: +# struct_packets = converter.python_dicts_to_struct_dicts(all_packets) + +# all_tags_as_tables: pa.Table = pa.Table.from_pylist( +# all_tags, schema=tag_schema +# ) +# all_packets_as_tables: pa.Table = pa.Table.from_pylist( +# struct_packets, schema=packet_schema +# ) + +# self._cached_output_table = arrow_utils.hstack_tables( +# all_tags_as_tables, all_packets_as_tables +# ) +# assert self._cached_output_table is not None, ( +# "_cached_output_table should not be None here." +# ) + +# if self._cached_output_table.num_rows == 0: +# return self._cached_output_table +# drop_columns = [] +# if not include_source: +# drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) +# if not include_data_context: +# drop_columns.append(constants.CONTEXT_KEY) +# if not include_system_tags: +# # TODO: come up with a more efficient approach +# drop_columns.extend( +# [ +# c +# for c in self._cached_output_table.column_names +# if c.startswith(constants.SYSTEM_TAG_PREFIX) +# ] +# ) + +# output_table = self._cached_output_table.drop_columns(drop_columns) + +# # lazily prepare content hash column if requested +# if include_content_hash: +# if self._cached_content_hash_column is None: +# content_hashes = [] +# for tag, packet in self.iter_packets( +# execution_engine=execution_engine or self.execution_engine, +# execution_engine_opts=execution_engine_opts +# or self._execution_engine_opts, +# ): +# content_hashes.append(packet.content_hash().to_string()) +# self._cached_content_hash_column = pa.array( +# content_hashes, type=pa.large_string() +# ) +# assert self._cached_content_hash_column is not None, ( +# "_cached_content_hash_column should not be None here." +# ) +# hash_column_name = ( +# "_content_hash" +# if include_content_hash is True +# else include_content_hash +# ) +# output_table = output_table.append_column( +# hash_column_name, self._cached_content_hash_column +# ) + +# if sort_by_tags: +# try: +# # TODO: consider having explicit tag/packet properties? 
+# output_table = output_table.sort_by( +# [(column, "ascending") for column in self.keys()[0]] +# ) +# except pa.ArrowTypeError: +# pass + +# return output_table diff --git a/src/orcapod/core/pods.py b/src/orcapod/core/legacy/pods.py similarity index 100% rename from src/orcapod/core/pods.py rename to src/orcapod/core/legacy/pods.py diff --git a/src/orcapod/core/operators/base.py b/src/orcapod/core/operators/base.py index 07b6ed2..0a84aae 100644 --- a/src/orcapod/core/operators/base.py +++ b/src/orcapod/core/operators/base.py @@ -2,12 +2,12 @@ from collections.abc import Collection from typing import Any -from orcapod.core.executable_pod import ExecutablePod +from orcapod.core.static_output_pod import StaticOutputPod from orcapod.protocols.core_protocols import ArgumentGroup, ColumnConfig, Stream from orcapod.types import PythonSchema -class Operator(ExecutablePod): +class Operator(StaticOutputPod): """ Base class for all operators. Operators are basic pods that can be used to perform operations on streams. @@ -34,7 +34,7 @@ def validate_unary_input(self, stream: Stream) -> None: ... @abstractmethod - def unary_execute(self, stream: Stream) -> Stream: + def unary_static_process(self, stream: Stream) -> Stream: """ This method should be implemented by subclasses to define the specific behavior of the unary operator. It takes one stream as input and returns a new stream as output. @@ -61,13 +61,13 @@ def validate_inputs(self, *streams: Stream) -> None: stream = streams[0] return self.validate_unary_input(stream) - def execute(self, *streams: Stream) -> Stream: + def static_process(self, *streams: Stream) -> Stream: """ Forward method for unary operators. It expects exactly one stream as input. """ stream = streams[0] - return self.unary_execute(stream) + return self.unary_static_process(stream) def output_schema( self, @@ -97,7 +97,9 @@ def validate_binary_inputs(self, left_stream: Stream, right_stream: Stream) -> N ... @abstractmethod - def binary_execute(self, left_stream: Stream, right_stream: Stream) -> Stream: + def binary_static_process( + self, left_stream: Stream, right_stream: Stream + ) -> Stream: """ Forward method for binary operators. It expects exactly two streams as input. diff --git a/src/orcapod/core/operators/batch.py b/src/orcapod/core/operators/batch.py index 83dc270..d8edb49 100644 --- a/src/orcapod/core/operators/batch.py +++ b/src/orcapod/core/operators/batch.py @@ -35,7 +35,7 @@ def validate_unary_input(self, stream: Stream) -> None: """ return None - def unary_execute(self, stream: Stream) -> Stream: + def unary_static_process(self, stream: Stream) -> Stream: """ This method should be implemented by subclasses to define the specific behavior of the binary operator. It takes two streams as input and returns a new stream as output. 
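Reviewer aside (illustration only, not part of the patch): with the rename above, operator subclasses now override unary_static_process / binary_static_process instead of unary_execute / binary_execute, while the validation hooks are unchanged. A minimal sketch of a pass-through unary operator written against the UnaryOperator base shown in operators/base.py; the Passthrough name is hypothetical, and any remaining abstract members of the base (e.g. output_schema) would still need to be provided:

    from orcapod.core.operators.base import UnaryOperator
    from orcapod.protocols.core_protocols import Stream

    class Passthrough(UnaryOperator):
        """Hypothetical operator that returns its input stream unchanged."""

        def validate_unary_input(self, stream: Stream) -> None:
            # Accept any input; a real operator would raise InputValidationError here.
            return None

        def unary_static_process(self, stream: Stream) -> Stream:
            # Core logic of the operator; the base class routes
            # static_process(stream) to this hook for unary operators.
            return stream
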
diff --git a/src/orcapod/core/operators/column_selection.py b/src/orcapod/core/operators/column_selection.py index 9bea9a7..851dadd 100644 --- a/src/orcapod/core/operators/column_selection.py +++ b/src/orcapod/core/operators/column_selection.py @@ -4,9 +4,9 @@ from orcapod.core.operators.base import UnaryOperator from orcapod.core.streams import TableStream -from orcapod.system_constants import constants from orcapod.errors import InputValidationError from orcapod.protocols.core_protocols import ColumnConfig, Stream +from orcapod.system_constants import constants from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule @@ -30,7 +30,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def unary_execute(self, stream: Stream) -> Stream: + def unary_static_process(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() tags_to_drop = [c for c in tag_columns if c not in self.columns] new_tag_columns = [c for c in tag_columns if c not in tags_to_drop] @@ -104,7 +104,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def unary_execute(self, stream: Stream) -> Stream: + def unary_static_process(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() packet_columns_to_drop = [c for c in packet_columns if c not in self.columns] new_packet_columns = [ @@ -187,7 +187,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def unary_execute(self, stream: Stream) -> Stream: + def unary_static_process(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() columns_to_drop = self.columns if not self.strict: @@ -263,7 +263,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def unary_execute(self, stream: Stream) -> Stream: + def unary_static_process(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() columns_to_drop = list(self.columns) if not self.strict: diff --git a/src/orcapod/core/operators/filters.py b/src/orcapod/core/operators/filters.py index 0e3bbb2..fb10689 100644 --- a/src/orcapod/core/operators/filters.py +++ b/src/orcapod/core/operators/filters.py @@ -4,9 +4,9 @@ from orcapod.core.operators.base import UnaryOperator from orcapod.core.streams import TableStream -from orcapod.system_constants import constants from orcapod.errors import InputValidationError from orcapod.protocols.core_protocols import ColumnConfig, Stream +from orcapod.system_constants import constants from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule @@ -42,7 +42,7 @@ def __init__( self.constraints = constraints if constraints is not None else {} super().__init__(**kwargs) - def unary_execute(self, stream: Stream) -> Stream: + def unary_static_process(self, stream: Stream) -> Stream: if len(self.predicates) == 0 and len(self.constraints) == 0: logger.info( "No predicates or constraints specified. Returning stream unaltered." 
@@ -102,7 +102,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs self.strict = strict super().__init__(**kwargs) - def unary_execute(self, stream: Stream) -> Stream: + def unary_static_process(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() packet_columns_to_drop = [c for c in packet_columns if c not in self.columns] new_packet_columns = [ diff --git a/src/orcapod/core/operators/join.py b/src/orcapod/core/operators/join.py index 55901ff..f9cf39f 100644 --- a/src/orcapod/core/operators/join.py +++ b/src/orcapod/core/operators/join.py @@ -78,7 +78,7 @@ def output_schema( return tag_typespec, packet_typespec - def execute(self, *streams: Stream) -> Stream: + def static_process(self, *streams: Stream) -> Stream: """ Joins two streams together based on their tags. The resulting stream will contain all the tags from both streams. diff --git a/src/orcapod/core/operators/mappers.py b/src/orcapod/core/operators/mappers.py index d2c2368..e15e5c2 100644 --- a/src/orcapod/core/operators/mappers.py +++ b/src/orcapod/core/operators/mappers.py @@ -3,9 +3,9 @@ from orcapod.core.operators.base import UnaryOperator from orcapod.core.streams import TableStream -from orcapod.system_constants import constants from orcapod.errors import InputValidationError from orcapod.protocols.core_protocols import ColumnConfig, Stream +from orcapod.system_constants import constants from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule @@ -29,7 +29,7 @@ def __init__( self.drop_unmapped = drop_unmapped super().__init__(**kwargs) - def unary_execute(self, stream: Stream) -> Stream: + def unary_static_process(self, stream: Stream) -> Stream: tag_columns, packet_columns = stream.keys() unmapped_columns = set(packet_columns) - set(self.name_map.keys()) diff --git a/src/orcapod/core/operators/semijoin.py b/src/orcapod/core/operators/semijoin.py index 5049409..e2e3232 100644 --- a/src/orcapod/core/operators/semijoin.py +++ b/src/orcapod/core/operators/semijoin.py @@ -28,7 +28,9 @@ class SemiJoin(BinaryOperator): The output stream preserves the schema of the left stream exactly. """ - def binary_execute(self, left_stream: Stream, right_stream: Stream) -> Stream: + def binary_static_process( + self, left_stream: Stream, right_stream: Stream + ) -> Stream: """ Performs a semi-join between left and right streams. Returns entries from left stream that have matching entries in right stream. 
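Reviewer aside (illustration only, not part of the patch): the semi-join semantics in the docstring above are the standard ones, shown here on plain polars DataFrames with made-up column names — rows of the left table are kept only when a matching key exists in the right table, and the left schema is preserved exactly:

    import polars as pl

    left = pl.DataFrame({"subject": ["a", "b", "c"], "score": [1, 2, 3]})
    right = pl.DataFrame({"subject": ["a", "c"]})

    # Keep left rows whose "subject" appears in right; no right columns are added.
    result = left.join(right, on="subject", how="semi")
    # result contains subjects "a" and "c" with their original "score" values.
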
diff --git a/src/orcapod/core/packet_function.py b/src/orcapod/core/packet_function.py index ab4deac..b0bed05 100644 --- a/src/orcapod/core/packet_function.py +++ b/src/orcapod/core/packet_function.py @@ -1,27 +1,46 @@ +from __future__ import annotations + import hashlib import logging import re import sys from abc import abstractmethod from collections.abc import Callable, Collection, Iterable, Sequence +from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, Literal from uuid_utils import uuid7 +from orcapod.config import Config +from orcapod.contexts import DataContext from orcapod.core.base import OrcapodBase -from orcapod.core.datagrams import DictPacket, ArrowPacket +from orcapod.core.datagrams import ArrowPacket, DictPacket from orcapod.hashing.hash_utils import get_function_components, get_function_signature -from orcapod.protocols.core_protocols import Packet, PacketFunction, Tag, Stream +from orcapod.protocols.core_protocols import Packet, PacketFunction +from orcapod.protocols.database_protocols import ArrowDatabase +from orcapod.system_constants import constants from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import schema_utils from orcapod.utils.git_utils import get_git_info_for_python_object from orcapod.utils.lazy_module import LazyModule -from orcapod.protocols.database_protocols import ArrowDatabase -from orcapod.system_constants import constants -from datetime import datetime, timezone + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc +else: + pa = LazyModule("pyarrow") + pc = LazyModule("pyarrow.compute") + +logger = logging.getLogger(__name__) + +error_handling_options = Literal["raise", "ignore", "warn"] -def process_function_output(self, values: Any) -> dict[str, DataValue]: +def parse_function_outputs(self, values: Any) -> dict[str, DataValue]: + """ + Process the output of a function and return a dictionary of DataValues, correctly parsing + the output based on expected number of values. + """ output_values = [] if len(self.output_keys) == 0: output_values = [] @@ -65,25 +84,21 @@ def combine_hashes( return combined_hash -if TYPE_CHECKING: - import pyarrow as pa - import pyarrow.compute as pc -else: - pa = LazyModule("pyarrow") - pc = LazyModule("pyarrow.compute") - -logger = logging.getLogger(__name__) - -error_handling_options = Literal["raise", "ignore", "warn"] - - class PacketFunctionBase(OrcapodBase): """ Abstract base class for PacketFunction, defining the interface and common functionality. """ - def __init__(self, version: str = "v0.0", **kwargs): - super().__init__(**kwargs) + def __init__( + self, + version: str = "v0.0", + label: str | None = None, + data_context: str | DataContext | None = None, + orcapod_config: Config | None = None, + ): + super().__init__( + label=label, data_context=data_context, orcapod_config=orcapod_config + ) self._active = True self._version = version @@ -123,7 +138,8 @@ def minor_version_string(self) -> str: @abstractmethod def packet_function_type_id(self) -> str: """ - Unique function type identifier + Unique function type identifier. This identifier is used for equivalence checks. + e.g. "python.function.v1" """ ... 
@@ -542,7 +558,7 @@ def record_packet( self._result_database.add_record( self.record_path, - output_packet.record_id, + output_packet.datagram_id, data_table, skip_duplicates=skip_duplicates, ) diff --git a/src/orcapod/core/executable_pod.py b/src/orcapod/core/static_output_pod.py similarity index 91% rename from src/orcapod/core/executable_pod.py rename to src/orcapod/core/static_output_pod.py index cdeab99..8832c65 100644 --- a/src/orcapod/core/executable_pod.py +++ b/src/orcapod/core/static_output_pod.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging from abc import abstractmethod from collections.abc import Collection, Iterator @@ -27,11 +29,14 @@ pa = LazyModule("pyarrow") -class ExecutablePod(OrcapodBase): +class StaticOutputPod(OrcapodBase): """ - Abstract Base class for all pods that requires execution to generate - static output stream. The output stream will reexecute the pod as necessary - to keep the output stream current. + Abstract Base class for basic pods with core logic that yields static output stream. + The static output stream will be wrapped in DynamicPodStream which will re-execute + the pod as necessary to ensure that the output stream is up-to-date. + + Furthermore, the invocation of the pod will be tracked by the tracker manager, registering + the pod as a general pod invocation. """ def __init__(self, tracker_manager: TrackerManager | None = None, **kwargs) -> None: @@ -42,7 +47,7 @@ def __init__(self, tracker_manager: TrackerManager | None = None, **kwargs) -> N def uri(self) -> tuple[str, ...]: """ Returns a unique resource identifier for the pod. - The pod URI must uniquely determine the necessary schema for the pod's information + The pod URI must uniquely determine the schema for the pod """ return ( f"{self.__class__.__name__}", @@ -124,7 +129,7 @@ def output_schema( ... @abstractmethod - def execute(self, *streams: Stream) -> Stream: + def static_process(self, *streams: Stream) -> Stream: """ Executes the pod on the input streams, returning a new static output stream. The output of execute is expected to be a static stream and thus only represent @@ -141,7 +146,7 @@ def execute(self, *streams: Stream) -> Stream: """ ... - def process(self, *streams: Stream, label: str | None = None) -> Stream: + def process(self, *streams: Stream, label: str | None = None) -> DynamicPodStream: """ Invoke the pod on a collection of streams, returning a KernelStream that represents the computation. @@ -157,13 +162,13 @@ def process(self, *streams: Stream, label: str | None = None) -> Stream: # perform input stream validation self.validate_inputs(*streams) self.tracker_manager.record_pod_invocation(self, upstreams=streams, label=label) - output_stream = ExecutablePodStream( + output_stream = DynamicPodStream( pod=self, upstreams=streams, ) return output_stream - def __call__(self, *streams: Stream, **kwargs) -> Stream: + def __call__(self, *streams: Stream, **kwargs) -> DynamicPodStream: """ Convenience method to invoke the pod process on a collection of streams, """ @@ -172,7 +177,7 @@ def __call__(self, *streams: Stream, **kwargs) -> Stream: return self.process(*streams, **kwargs) -class ExecutablePodStream(StreamBase): +class DynamicPodStream(StreamBase): """ Recomputable stream wrapping a PodBase @@ -184,7 +189,7 @@ class ExecutablePodStream(StreamBase): def __init__( self, - pod: ExecutablePod, + pod: StaticOutputPod, upstreams: tuple[ Stream, ... 
] = (), # if provided, this will override the upstreams of the output_stream @@ -276,7 +281,7 @@ def run(self, *args: Any, **kwargs: Any) -> None: # recompute if cache is invalid if self._cached_time is None or self._cached_stream is None: - self._cached_stream = self._pod.execute( + self._cached_stream = self._pod.static_process( *self.upstreams, ) self._cached_time = datetime.now() diff --git a/src/orcapod/core/streams/__init__.py b/src/orcapod/core/streams/__init__.py index 2004bbe..6fb3105 100644 --- a/src/orcapod/core/streams/__init__.py +++ b/src/orcapod/core/streams/__init__.py @@ -1,19 +1,7 @@ -# from .base import StatefulStreamBase -# from .pod_stream import KernelStream -from .table_stream import TableStream - -# from .packet_processor_stream import LazyPodResultStream -# from .cached_packet_processor_stream import CachedPodStream -# from .wrapped_stream import WrappedStream -# from .pod_node_stream import PodNodeStream - +from orcapod.core.streams.base import StreamBase +from orcapod.core.streams.table_stream import TableStream __all__ = [ - "StatefulStreamBase", - "KernelStream", + "StreamBase", "TableStream", - "LazyPodResultStream", - "CachedPodStream", - "WrappedStream", - "PodNodeStream", ] diff --git a/src/orcapod/core/streams/base.py b/src/orcapod/core/streams/base.py index 819ce96..a17447b 100644 --- a/src/orcapod/core/streams/base.py +++ b/src/orcapod/core/streams/base.py @@ -1,19 +1,17 @@ -from calendar import c import logging from abc import abstractmethod from collections.abc import Collection, Iterator, Mapping from typing import TYPE_CHECKING, Any from orcapod.core.base import OrcapodBase -from orcapod.protocols.core_protocols import Pod, Stream, Tag, Packet, ColumnConfig +from orcapod.protocols.core_protocols import ColumnConfig, Packet, Pod, Stream, Tag from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule - if TYPE_CHECKING: - import pyarrow as pa - import polars as pl import pandas as pd + import polars as pl + import pyarrow as pa else: pa = LazyModule("pyarrow") pl = LazyModule("polars") @@ -289,11 +287,11 @@ def flow( def _repr_html_(self) -> str: df = self.as_polars_df() # reorder columns - new_column_order = [c for c in df.columns if c in self.tag_keys()] + [ - c for c in df.columns if c not in self.tag_keys() + new_column_order = [c for c in df.columns if c in self.keys()[0]] + [ + c for c in df.columns if c not in self.keys()[0] ] df = df[new_column_order] - tag_map = {t: f"*{t}" for t in self.tag_keys()} + tag_map = {t: f"*{t}" for t in self.keys()[0]} # TODO: construct repr html better df = df.rename(tag_map) return f"{self.__class__.__name__}[{self.label}]\n" + df._repr_html_() diff --git a/src/orcapod/core/streams/pod_node_stream.py b/src/orcapod/core/streams/pod_node_stream.py deleted file mode 100644 index 931a5c6..0000000 --- a/src/orcapod/core/streams/pod_node_stream.py +++ /dev/null @@ -1,424 +0,0 @@ -import logging -from collections.abc import Iterator -from typing import TYPE_CHECKING, Any - -import orcapod.protocols.core_protocols.execution_engine -from orcapod.contexts.system_constants import constants -from orcapod.core.streams.base import StreamBase -from orcapod.core.streams.table_stream import TableStream -from orcapod.protocols import core_protocols as cp -from orcapod.protocols import pipeline_protocols as pp -from orcapod.types import PythonSchema -from orcapod.utils import arrow_utils -from orcapod.utils.lazy_module import LazyModule - -if TYPE_CHECKING: - import polars as pl - import pyarrow as pa - 
import pyarrow.compute as pc - -else: - pa = LazyModule("pyarrow") - pc = LazyModule("pyarrow.compute") - pl = LazyModule("polars") - - -# TODO: consider using this instead of making copy of dicts -# from types import MappingProxyType - -logger = logging.getLogger(__name__) - - -class PodNodeStream(StreamBase): - """ - A fixed stream that is both cached pod and pipeline storage aware - """ - - # TODO: define interface for storage or pod storage - def __init__(self, pod_node: pp.PodNode, input_stream: cp.Stream, **kwargs): - super().__init__(source=pod_node, upstreams=(input_stream,), **kwargs) - self.pod_node = pod_node - self.input_stream = input_stream - - # capture the immutable iterator from the input stream - self._prepared_stream_iterator = input_stream.iter_packets() - self._set_modified_time() # set modified time to when we obtain the iterator - - # Packet-level caching (from your PodStream) - self._cached_output_packets: list[tuple[cp.Tag, cp.Packet | None]] | None = None - self._cached_output_table: pa.Table | None = None - self._cached_content_hash_column: pa.Array | None = None - - def set_mode(self, mode: str) -> None: - return self.pod_node.set_mode(mode) - - @property - def mode(self) -> str: - return self.pod_node.mode - - async def run_async( - self, - *args: Any, - execution_engine_opts: dict[str, Any] | None = None, - **kwargs: Any, - ) -> None: - """ - Runs the stream, processing the input stream and preparing the output stream. - This is typically called before iterating over the packets. - """ - if self._cached_output_packets is None: - cached_results, missing = self._identify_existing_and_missing_entries( - *args, - execution_engine=execution_engine, - execution_engine_opts=execution_engine_opts, - **kwargs, - ) - - tag_keys = self.input_stream.keys()[0] - - pending_calls = [] - if missing is not None and missing.num_rows > 0: - for tag, packet in TableStream(missing, tag_columns=tag_keys): - # Since these packets are known to be missing, skip the cache lookup - pending = self.pod_node.async_call( - tag, - packet, - skip_cache_lookup=True, - execution_engine=execution_engine or self.execution_engine, - execution_engine_opts=execution_engine_opts - or self._execution_engine_opts, - ) - pending_calls.append(pending) - - import asyncio - - completed_calls = await asyncio.gather(*pending_calls) - for result in completed_calls: - cached_results.append(result) - - self.clear_cache() - self._cached_output_packets = cached_results - self._set_modified_time() - self.pod_node.flush() - - def _identify_existing_and_missing_entries( - self, - *args: Any, - execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine - | None = None, - execution_engine_opts: dict[str, Any] | None = None, - **kwargs: Any, - ) -> tuple[list[tuple[cp.Tag, cp.Packet | None]], pa.Table | None]: - cached_results: list[tuple[cp.Tag, cp.Packet | None]] = [] - - # identify all entries in the input stream for which we still have not computed packets - if len(args) > 0 or len(kwargs) > 0: - input_stream_used = self.input_stream.polars_filter(*args, **kwargs) - else: - input_stream_used = self.input_stream - - target_entries = input_stream_used.as_table( - include_system_tags=True, - include_source=True, - include_content_hash=constants.INPUT_PACKET_HASH, - execution_engine=execution_engine or self.execution_engine, - execution_engine_opts=execution_engine_opts or self._execution_engine_opts, - ) - existing_entries = self.pod_node.get_all_cached_outputs( - include_system_columns=True - 
) - if ( - existing_entries is None - or existing_entries.num_rows == 0 - or self.mode == "development" - ): - missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) - existing = None - else: - # TODO: do more proper replacement operation - target_df = pl.DataFrame(target_entries) - existing_df = pl.DataFrame( - existing_entries.append_column( - "_exists", pa.array([True] * len(existing_entries)) - ) - ) - all_results_df = target_df.join( - existing_df, - on=constants.INPUT_PACKET_HASH, - how="left", - suffix="_right", - ) - all_results = all_results_df.to_arrow() - - missing = ( - all_results.filter(pc.is_null(pc.field("_exists"))) - .select(target_entries.column_names) - .drop_columns([constants.INPUT_PACKET_HASH]) - ) - - existing = all_results.filter( - pc.is_valid(pc.field("_exists")) - ).drop_columns( - [ - "_exists", - constants.INPUT_PACKET_HASH, - constants.PACKET_RECORD_ID, - *self.input_stream.keys()[1], # remove the input packet keys - ] - # TODO: look into NOT fetching back the record ID - ) - renamed = [ - c.removesuffix("_right") if c.endswith("_right") else c - for c in existing.column_names - ] - existing = existing.rename_columns(renamed) - - tag_keys = self.input_stream.keys()[0] - - if existing is not None and existing.num_rows > 0: - # If there are existing entries, we can cache them - # TODO: cache them based on the record ID - existing_stream = TableStream(existing, tag_columns=tag_keys) - for tag, packet in existing_stream.iter_packets(): - cached_results.append((tag, packet)) - - return cached_results, missing - - def run( - self, - *args: Any, - execution_engine: cp.ExecutionEngine | None = None, - execution_engine_opts: dict[str, Any] | None = None, - **kwargs: Any, - ) -> None: - tag_keys = self.input_stream.keys()[0] - cached_results, missing = self._identify_existing_and_missing_entries( - *args, - execution_engine=execution_engine, - execution_engine_opts=execution_engine_opts, - **kwargs, - ) - - if missing is not None and missing.num_rows > 0: - packet_record_to_output_lut: dict[str, cp.Packet | None] = {} - execution_engine_hash = ( - execution_engine.name if execution_engine is not None else "default" - ) - for tag, packet in TableStream(missing, tag_columns=tag_keys): - # compute record id - packet_record_id = self.pod_node.get_record_id( - packet, execution_engine_hash=execution_engine_hash - ) - - # Since these packets are known to be missing, skip the cache lookup - if packet_record_id in packet_record_to_output_lut: - output_packet = packet_record_to_output_lut[packet_record_id] - else: - tag, output_packet = self.pod_node.call( - tag, - packet, - record_id=packet_record_id, - skip_cache_lookup=True, - execution_engine=execution_engine or self.execution_engine, - execution_engine_opts=execution_engine_opts - or self._execution_engine_opts, - ) - packet_record_to_output_lut[packet_record_id] = output_packet - self.pod_node.add_pipeline_record( - tag, - packet, - packet_record_id, - retrieved=False, - skip_cache_lookup=True, - ) - cached_results.append((tag, output_packet)) - - # reset the cache and set new results - self.clear_cache() - self._cached_output_packets = cached_results - self._set_modified_time() - self.pod_node.flush() - # TODO: evaluate proper handling of cache here - # self.clear_cache() - - def clear_cache(self) -> None: - self._cached_output_packets = None - self._cached_output_table = None - self._cached_content_hash_column = None - - def iter_packets( - self, - execution_engine: 
orcapod.protocols.core_protocols.execution_engine.ExecutionEngine, - execution_engine_opts: dict[str, Any] | None = None, - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - """ - Processes the input stream and prepares the output stream. - This is typically called before iterating over the packets. - """ - - # if results are cached, simply return from them - if self._cached_output_packets is not None: - for tag, packet in self._cached_output_packets: - if packet is not None: - # make sure to skip over an empty packet - yield tag, packet - else: - cached_results = [] - # prepare the cache by loading from the record - total_table = self.pod_node.get_all_records(include_system_columns=True) - if total_table is None: - return # empty out - tag_types, packet_types = self.pod_node.output_types() - - for tag, packet in TableStream(total_table, tag_columns=tag_types.keys()): - cached_results.append((tag, packet)) - yield tag, packet - - # come up with a better caching mechanism - self._cached_output_packets = cached_results - self._set_modified_time() - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - - tag_keys, _ = self.input_stream.keys(include_system_tags=include_system_tags) - packet_keys = tuple(self.pod_node.output_packet_types().keys()) - return tag_keys, packet_keys - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, _ = self.input_stream.types( - include_system_tags=include_system_tags - ) - # TODO: check if copying can be avoided - packet_typespec = dict(self.pod_node.output_packet_types()) - return tag_typespec, packet_typespec - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: orcapod.protocols.core_protocols.execution_engine.ExecutionEngine - | None = None, - execution_engine_opts: dict[str, Any] | None = None, - ) -> "pa.Table": - if self._cached_output_table is None: - all_tags = [] - all_packets = [] - tag_schema, packet_schema = None, None - for tag, packet in self.iter_packets( - execution_engine=execution_engine or self.execution_engine, - execution_engine_opts=execution_engine_opts - or self._execution_engine_opts, - ): - if tag_schema is None: - tag_schema = tag.arrow_schema(include_system_tags=True) - if packet_schema is None: - packet_schema = packet.arrow_schema( - include_context=True, - include_source=True, - ) - all_tags.append(tag.as_dict(include_system_tags=True)) - # FIXME: using in the pinch conversion to str from path - # replace with an appropriate semantic converter-based approach! 
- dict_patcket = packet.as_dict(include_context=True, include_source=True) - all_packets.append(dict_patcket) - - converter = self.data_context.type_converter - - if len(all_tags) == 0: - tag_types, packet_types = self.pod_node.output_types( - include_system_tags=True - ) - tag_schema = converter.python_schema_to_arrow_schema(tag_types) - source_entries = { - f"{constants.SOURCE_PREFIX}{c}": str for c in packet_types.keys() - } - packet_types.update(source_entries) - packet_types[constants.CONTEXT_KEY] = str - packet_schema = converter.python_schema_to_arrow_schema(packet_types) - total_schema = arrow_utils.join_arrow_schemas(tag_schema, packet_schema) - # return an empty table with the right schema - self._cached_output_table = pa.Table.from_pylist( - [], schema=total_schema - ) - else: - struct_packets = converter.python_dicts_to_struct_dicts(all_packets) - - all_tags_as_tables: pa.Table = pa.Table.from_pylist( - all_tags, schema=tag_schema - ) - all_packets_as_tables: pa.Table = pa.Table.from_pylist( - struct_packets, schema=packet_schema - ) - - self._cached_output_table = arrow_utils.hstack_tables( - all_tags_as_tables, all_packets_as_tables - ) - assert self._cached_output_table is not None, ( - "_cached_output_table should not be None here." - ) - - if self._cached_output_table.num_rows == 0: - return self._cached_output_table - drop_columns = [] - if not include_source: - drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) - if not include_data_context: - drop_columns.append(constants.CONTEXT_KEY) - if not include_system_tags: - # TODO: come up with a more efficient approach - drop_columns.extend( - [ - c - for c in self._cached_output_table.column_names - if c.startswith(constants.SYSTEM_TAG_PREFIX) - ] - ) - - output_table = self._cached_output_table.drop_columns(drop_columns) - - # lazily prepare content hash column if requested - if include_content_hash: - if self._cached_content_hash_column is None: - content_hashes = [] - for tag, packet in self.iter_packets( - execution_engine=execution_engine or self.execution_engine, - execution_engine_opts=execution_engine_opts - or self._execution_engine_opts, - ): - content_hashes.append(packet.content_hash().to_string()) - self._cached_content_hash_column = pa.array( - content_hashes, type=pa.large_string() - ) - assert self._cached_content_hash_column is not None, ( - "_cached_content_hash_column should not be None here." - ) - hash_column_name = ( - "_content_hash" - if include_content_hash is True - else include_content_hash - ) - output_table = output_table.append_column( - hash_column_name, self._cached_content_hash_column - ) - - if sort_by_tags: - try: - # TODO: consider having explicit tag/packet properties? 
- output_table = output_table.sort_by( - [(column, "ascending") for column in self.keys()[0]] - ) - except pa.ArrowTypeError: - pass - - return output_table diff --git a/src/orcapod/core/streams/table_stream.py b/src/orcapod/core/streams/table_stream.py index 94e498a..83c8a65 100644 --- a/src/orcapod/core/streams/table_stream.py +++ b/src/orcapod/core/streams/table_stream.py @@ -9,13 +9,12 @@ ArrowTag, DictTag, ) +from orcapod.core.streams.base import StreamBase +from orcapod.protocols.core_protocols import ColumnConfig, Pod, Stream, Tag from orcapod.system_constants import constants -from orcapod.protocols.core_protocols import Pod, Tag, Stream, ColumnConfig - from orcapod.types import PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.core.streams.base import StreamBase if TYPE_CHECKING: import pyarrow as pa diff --git a/src/orcapod/core/tracker.py b/src/orcapod/core/tracker.py index 2a78ae7..49b09a6 100644 --- a/src/orcapod/core/tracker.py +++ b/src/orcapod/core/tracker.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import Generator @@ -114,21 +116,24 @@ def is_active(self) -> bool: return self._active @abstractmethod - def record_kernel_invocation( + def record_pod_invocation( self, - kernel: cp.Pod, + pod: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> None: ... @abstractmethod - def record_source_invocation( - self, source: cp.SourcePod, label: str | None = None + def record_source_pod_invocation( + self, source_pod: cp.SourcePod, label: str | None = None ) -> None: ... @abstractmethod - def record_pod_invocation( - self, pod: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None + def record_packet_function_invocation( + self, + packet_function: cp.PacketFunction, + input_stream: cp.Stream, + label: str | None = None, ) -> None: ... 
def __enter__(self): @@ -184,7 +189,7 @@ def identity_structure(self) -> Any: # if no upstreams, then we want to identify the source directly if not self.upstreams: return self.kernel.identity_structure() - return self.kernel.identity_structure(self.upstreams) + return self.kernel.identity_structure() def __repr__(self) -> str: return f"Invocation(kernel={self.kernel}, upstreams={self.upstreams}, label={self.label})" diff --git a/src/orcapod/databases/basic_delta_lake_arrow_database.py b/src/orcapod/databases/basic_delta_lake_arrow_database.py index 412d247..9781b2b 100644 --- a/src/orcapod/databases/basic_delta_lake_arrow_database.py +++ b/src/orcapod/databases/basic_delta_lake_arrow_database.py @@ -1,8 +1,7 @@ import logging from collections import defaultdict -from collections.abc import Collection, Mapping from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, cast from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError diff --git a/src/orcapod/protocols/core_protocols/datagrams.py b/src/orcapod/protocols/core_protocols/datagrams.py index 5e6114f..84a2264 100644 --- a/src/orcapod/protocols/core_protocols/datagrams.py +++ b/src/orcapod/protocols/core_protocols/datagrams.py @@ -9,7 +9,7 @@ runtime_checkable, ) -from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.protocols.hashing_protocols import ContentIdentifiable, DataContextAware from orcapod.types import DataType, DataValue, PythonSchema if TYPE_CHECKING: @@ -151,7 +151,7 @@ def handle_config( @runtime_checkable -class Datagram(ContentIdentifiable, Protocol): +class Datagram(ContentIdentifiable, DataContextAware, Protocol): """ Protocol for immutable datagram containers in Orcapod. @@ -178,7 +178,7 @@ class Datagram(ContentIdentifiable, Protocol): """ @property - def record_id(self) -> str: + def datagram_id(self) -> str: """ Return the UUID of this datagram. @@ -187,23 +187,6 @@ def record_id(self) -> str: """ ... - # 1. Core Properties (Identity & Structure) - @property - def data_context_key(self) -> str: - """ - Return the data context key for this datagram. - - This key identifies a collection of system components that collectively controls - how information is serialized, hashed and represented, including the semantic type registry, - arrow data hasher, and other contextual information. Same piece of information (that is two datagrams - with an identical *logical* content) may bear distinct internal representation if they are - represented under two distinct data context, as signified by distinct data context keys. - - Returns: - str: Context key for proper datagram interpretation - """ - ... - @property def meta_columns(self) -> tuple[str, ...]: """Return tuple of meta column names (with {constants.META_PREFIX} ('__') prefix).""" diff --git a/src/orcapod/protocols/core_protocols/function_pod.py b/src/orcapod/protocols/core_protocols/function_pod.py index 2b6108b..31e5f1c 100644 --- a/src/orcapod/protocols/core_protocols/function_pod.py +++ b/src/orcapod/protocols/core_protocols/function_pod.py @@ -9,8 +9,6 @@ class FunctionPod(Pod, Protocol): """ Pod based on PacketFunction. 
- - """ @property diff --git a/src/orcapod/protocols/core_protocols/orcapod_object.py b/src/orcapod/protocols/core_protocols/orcapod_object.py new file mode 100644 index 0000000..acefb75 --- /dev/null +++ b/src/orcapod/protocols/core_protocols/orcapod_object.py @@ -0,0 +1,11 @@ +from typing import Protocol + +from orcapod.protocols.core_protocols.labelable import Labelable +from orcapod.protocols.core_protocols.temporal import Temporal +from orcapod.protocols.hashing_protocols import ContentIdentifiable, DataContextAware + + +class OrcapodObject( + DataContextAware, ContentIdentifiable, Labelable, Temporal, Protocol +): + pass diff --git a/src/orcapod/protocols/core_protocols/packet_function.py b/src/orcapod/protocols/core_protocols/packet_function.py index 059c629..62878cc 100644 --- a/src/orcapod/protocols/core_protocols/packet_function.py +++ b/src/orcapod/protocols/core_protocols/packet_function.py @@ -2,8 +2,8 @@ from orcapod.protocols.core_protocols.datagrams import Packet from orcapod.protocols.core_protocols.labelable import Labelable -from orcapod.types import PythonSchema from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.types import PythonSchema @runtime_checkable diff --git a/src/orcapod/protocols/core_protocols/pod.py b/src/orcapod/protocols/core_protocols/pod.py index 6b98790..e08434b 100644 --- a/src/orcapod/protocols/core_protocols/pod.py +++ b/src/orcapod/protocols/core_protocols/pod.py @@ -1,12 +1,10 @@ from collections.abc import Collection from typing import Any, Protocol, TypeAlias, runtime_checkable +from orcapod.protocols.core_protocols.datagrams import ColumnConfig +from orcapod.protocols.core_protocols.orcapod_object import OrcapodObject from orcapod.protocols.core_protocols.packet_function import PacketFunction -from orcapod.protocols.core_protocols.datagrams import ColumnConfig, Tag, Packet -from orcapod.protocols.core_protocols.labelable import Labelable from orcapod.protocols.core_protocols.streams import Stream -from orcapod.protocols.core_protocols.temporal import Temporal -from orcapod.protocols.hashing_protocols import ContentIdentifiable, DataContextAware from orcapod.types import PythonSchema # Core recursive types @@ -17,7 +15,7 @@ @runtime_checkable -class Pod(DataContextAware, ContentIdentifiable, Labelable, Temporal, Protocol): +class Pod(OrcapodObject, Protocol): """ The fundamental unit of computation in Orcapod. diff --git a/src/orcapod/protocols/core_protocols/streams.py b/src/orcapod/protocols/core_protocols/streams.py index 85b490c..b395fcd 100644 --- a/src/orcapod/protocols/core_protocols/streams.py +++ b/src/orcapod/protocols/core_protocols/streams.py @@ -2,9 +2,7 @@ from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable from orcapod.protocols.core_protocols.datagrams import ColumnConfig, Packet, Tag -from orcapod.protocols.core_protocols.labelable import Labelable -from orcapod.protocols.core_protocols.temporal import Temporal -from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.protocols.core_protocols.orcapod_object import OrcapodObject from orcapod.types import PythonSchema if TYPE_CHECKING: @@ -16,7 +14,7 @@ @runtime_checkable -class Stream(ContentIdentifiable, Labelable, Temporal, Protocol): +class Stream(OrcapodObject, Protocol): """ Base protocol for all streams in Orcapod. 
diff --git a/src/orcapod/core/arrow_data_utils.py b/src/orcapod/utils/arrow_data_utils.py similarity index 100% rename from src/orcapod/core/arrow_data_utils.py rename to src/orcapod/utils/arrow_data_utils.py diff --git a/src/orcapod/core/polars_data_utils.py b/src/orcapod/utils/polars_data_utils.py similarity index 100% rename from src/orcapod/core/polars_data_utils.py rename to src/orcapod/utils/polars_data_utils.py From 03f9dcc3e34eb8de876e21373d0297fedb691613 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 2 Dec 2025 18:42:21 +0000 Subject: [PATCH 10/11] feature: cache packet output schema hash, use logging --- src/orcapod/core/operators/join.py | 3 +-- src/orcapod/core/packet_function.py | 27 +++++++++++-------- .../basic_delta_lake_arrow_database.py | 3 ++- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/orcapod/core/operators/join.py b/src/orcapod/core/operators/join.py index f9cf39f..22994f7 100644 --- a/src/orcapod/core/operators/join.py +++ b/src/orcapod/core/operators/join.py @@ -1,13 +1,12 @@ from collections.abc import Collection from typing import TYPE_CHECKING, Any -from orcapod.core import arrow_data_utils from orcapod.core.operators.base import NonZeroInputOperator from orcapod.core.streams import TableStream from orcapod.errors import InputValidationError from orcapod.protocols.core_protocols import ArgumentGroup, ColumnConfig, Stream from orcapod.types import PythonSchema -from orcapod.utils import schema_utils +from orcapod.utils import arrow_data_utils, schema_utils from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: diff --git a/src/orcapod/core/packet_function.py b/src/orcapod/core/packet_function.py index b0bed05..2ff1be1 100644 --- a/src/orcapod/core/packet_function.py +++ b/src/orcapod/core/packet_function.py @@ -111,14 +111,17 @@ def __init__( f"Version string {version} does not contain a valid version number" ) + # compute and store hash for output_packet_schema + self._output_packet_schema_hash = self.data_context.object_hasher.hash_object( + self.output_packet_schema + ).to_string() + @property def uri(self) -> tuple[str, ...]: # TODO: make this more efficient return ( - f"{self.canonical_function_name}", - self.data_context.object_hasher.hash_object( - self.output_packet_schema - ).to_string(), + self.canonical_function_name, + self._output_packet_schema_hash, f"v{self.major_version}", self.packet_function_type_id, ) @@ -450,11 +453,11 @@ def call( # execution_engine_hash = execution_engine.name if execution_engine else "default" output_packet = None if not skip_cache_lookup: - print("Checking for cache...") + logger.info("Checking for cache...") # lookup stored result for the input packet output_packet = self.get_cached_output_for_packet(packet) if output_packet is not None: - print(f"Cache hit for {packet}!") + logger.info(f"Cache hit for {packet}!") if output_packet is None: output_packet = self._packet_function.call(packet) if output_packet is not None: @@ -525,22 +528,24 @@ def record_packet( # TODO: consider incorporating execution_engine_opts into the record data_table = output_packet.as_table(columns={"source": True, "context": True}) - i = -1 - for i, (k, v) in enumerate(self.get_function_variation_data().items()): + i = 0 + for k, v in self.get_function_variation_data().items(): # add the tiered pod ID to the data table data_table = data_table.add_column( i, f"{constants.PF_VARIATION_PREFIX}{k}", pa.array([v], type=pa.large_string()), ) + i += 1 - for j, (k, v) in 
enumerate(self.get_execution_data().items()): + for k, v in self.get_execution_data().items(): # add the tiered pod ID to the data table data_table = data_table.add_column( - i + j + 1, + i, f"{constants.PF_EXECUTION_PREFIX}{k}", pa.array([v], type=pa.large_string()), ) + i += 1 # add the input packet hash as a column data_table = data_table.add_column( @@ -558,7 +563,7 @@ def record_packet( self._result_database.add_record( self.record_path, - output_packet.datagram_id, + output_packet.datagram_id, # output packet datagram ID (uuid) is used as a unique identification data_table, skip_duplicates=skip_duplicates, ) diff --git a/src/orcapod/databases/basic_delta_lake_arrow_database.py b/src/orcapod/databases/basic_delta_lake_arrow_database.py index 9781b2b..39334e2 100644 --- a/src/orcapod/databases/basic_delta_lake_arrow_database.py +++ b/src/orcapod/databases/basic_delta_lake_arrow_database.py @@ -6,7 +6,7 @@ from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError -from orcapod.core import constants +from orcapod.system_constants import constants from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: @@ -1002,6 +1002,7 @@ def get_table_info(self, record_path: tuple[str, ...]) -> dict[str, Any] | None: "pending_records": pending_count, } + # FIXME: handle more specific exception only except Exception as e: logger.error(f"Error getting table info for {'/'.join(record_path)}: {e}") return None From 8d4fbbbca38cdf740cf63708cc6784d5c0dc3e47 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 2 Dec 2025 21:11:03 +0000 Subject: [PATCH 11/11] refactor: Lazily compute packet function output schema hash and explicitly define function pod constructor parameters. --- src/orcapod/core/function_pod.py | 49 +++++++++++++++++++++++------ src/orcapod/core/packet_function.py | 18 +++++++---- 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/src/orcapod/core/function_pod.py b/src/orcapod/core/function_pod.py index 9f90947..f07f6b4 100644 --- a/src/orcapod/core/function_pod.py +++ b/src/orcapod/core/function_pod.py @@ -22,6 +22,7 @@ ) from orcapod.protocols.database_protocols import ArrowDatabase from orcapod.system_constants import constants +from orcapod.config import Config from orcapod.types import PythonSchema from orcapod.utils import arrow_utils, schema_utils from orcapod.utils.lazy_module import LazyModule @@ -41,9 +42,15 @@ def __init__( self, packet_function: PacketFunction, tracker_manager: TrackerManager | None = None, - **kwargs, + label: str | None = None, + data_context: str | contexts.DataContext | None = None, + orcapod_config: Config | None = None, ) -> None: - super().__init__(**kwargs) + super().__init__( + label=label, + data_context=data_context, + orcapod_config=orcapod_config, + ) self.tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self._packet_function = packet_function self._output_schema_hash = self.data_context.object_hasher.hash_object( @@ -472,8 +479,8 @@ def process(self, *streams: Stream, label: str | None = None) -> FunctionPodStre class FunctionPodNode(OrcapodBase): """ - A pod that caches the results of the wrapped pod. - This is useful for pods that are expensive to compute and can benefit from caching. + A pod that caches the results of the wrapped packet function. + This is useful for packet functions that are expensive to compute and can benefit from caching. 
""" def __init__( @@ -484,7 +491,9 @@ def __init__( result_database: ArrowDatabase | None = None, pipeline_path_prefix: tuple[str, ...] = (), tracker_manager: TrackerManager | None = None, - **kwargs, + label: str | None = None, + data_context: str | contexts.DataContext | None = None, + orcapod_config: Config | None = None, ): if tracker_manager is None: tracker_manager = DEFAULT_TRACKER_MANAGER @@ -502,7 +511,22 @@ def __init__( ) # initialize the base FunctionPod with the cached packet function - super().__init__(**kwargs) + super().__init__( + label=label, + data_context=data_context, + orcapod_config=orcapod_config, + ) + + # validate the input stream + _, incoming_packet_types = input_stream.output_schema() + expected_packet_schema = packet_function.input_packet_schema + if not schema_utils.check_typespec_compatibility( + incoming_packet_types, expected_packet_schema + ): + # TODO: use custom exception type for better error handling + raise ValueError( + f"Incoming packet data type {incoming_packet_types} from {input_stream} is not compatible with expected input typespec {expected_packet_schema}" + ) self._input_stream = input_stream @@ -523,6 +547,8 @@ def __init__( ).to_string() def identity_structure(self) -> Any: + # Identity of function pod node is the identity of the + # (cached) packet function + input stream return (self._cached_packet_function, self._input_stream) @property @@ -598,7 +624,9 @@ def process( logger.debug(f"Invoking kernel {self} on streams: {streams}") # perform input stream validation - self.validate_inputs(self._input_stream) + self.validate_inputs(*streams) + # TODO: add logic to handle/modify input stream based on streams passed in + # Example includes appling semi_join on the input stream based on the streams passed in self.tracker_manager.record_packet_function_invocation( self._cached_packet_function, self._input_stream, label=label ) @@ -669,6 +697,9 @@ def add_pipeline_record( if existing_record is not None: # if the record already exists, then skip adding + logger.debug( + f"Record with entry_id {entry_id} already exists. Skipping addition." 
+ ) return # rename all keys to avoid potential collision with result columns @@ -678,11 +709,11 @@ def add_pipeline_record( input_packet_info = ( renamed_input_packet.as_table(columns={"source": True}) .append_column( - constants.PACKET_RECORD_ID, + constants.PACKET_RECORD_ID, # record ID for the packet function output packet pa.array([packet_record_id], type=pa.large_string()), ) .append_column( - f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", + f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", # data context key for the input packet pa.array([input_packet.data_context_key], type=pa.large_string()), ) .append_column( diff --git a/src/orcapod/core/packet_function.py b/src/orcapod/core/packet_function.py index 2ff1be1..583d8a6 100644 --- a/src/orcapod/core/packet_function.py +++ b/src/orcapod/core/packet_function.py @@ -111,17 +111,23 @@ def __init__( f"Version string {version} does not contain a valid version number" ) - # compute and store hash for output_packet_schema - self._output_packet_schema_hash = self.data_context.object_hasher.hash_object( - self.output_packet_schema - ).to_string() + self._output_packet_schema_hash = None + + @property + def output_packet_schema_hash(self) -> str | None: + if self._output_packet_schema_hash is None: + self._output_packet_schema_hash = ( + self.data_context.object_hasher.hash_object( + self.output_packet_schema + ).to_string() + ) + return self._output_packet_schema_hash @property def uri(self) -> tuple[str, ...]: - # TODO: make this more efficient return ( self.canonical_function_name, - self._output_packet_schema_hash, + self.output_packet_schema_hash, f"v{self.major_version}", self.packet_function_type_id, )
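
The last two hunks replace the hash computed eagerly in __init__ (patch 10) with one computed on first access, and spell out the constructor keywords that were previously swallowed by **kwargs. A minimal sketch of both patterns, with hashlib/json standing in for the data context's object_hasher (which this sketch does not reproduce):

    # Sketch of both patterns: keyword parameters spelled out instead of **kwargs,
    # and a schema hash computed only on first access. hashlib/json stand in for
    # data_context.object_hasher, which this sketch does not reproduce.
    import hashlib
    import json


    class PacketFunctionSketch:
        def __init__(
            self,
            output_packet_schema: dict[str, str],
            label: str | None = None,         # explicit, discoverable parameters
            data_context: str | None = None,  # instead of an opaque **kwargs
        ) -> None:
            self.output_packet_schema = output_packet_schema
            self.label = label
            self.data_context = data_context
            self._output_packet_schema_hash: str | None = None  # filled lazily

        @property
        def output_packet_schema_hash(self) -> str:
            # compute once on first access, reuse for every later uri lookup
            if self._output_packet_schema_hash is None:
                canonical = json.dumps(self.output_packet_schema, sort_keys=True)
                self._output_packet_schema_hash = hashlib.sha256(
                    canonical.encode()
                ).hexdigest()
            return self._output_packet_schema_hash


    pf = PacketFunctionSketch({"path": "str"}, label="demo")
    assert pf.output_packet_schema_hash == pf.output_packet_schema_hash  # cached

functools.cached_property would give the same one-shot evaluation; the explicit None sentinel simply mirrors the shape the patch uses.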
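
Patch 10's record_packet change drops the enumerate-based index arithmetic (i + j + 1) in favor of a single running counter when prepending the variation and execution columns. A standalone pyarrow sketch of that insertion pattern (the __variation_/__execution_ prefixes below are placeholders, not orcapod's actual constants):

    # Standalone illustration of prepending metadata columns with one running
    # index; the __variation_/__execution_ names are placeholders, not orcapod's
    # real column prefixes.
    import pyarrow as pa

    data_table = pa.table({"result": [42]})

    i = 0
    for k, v in {"image": "py3.11", "seed": "7"}.items():
        data_table = data_table.add_column(
            i, f"__variation_{k}", pa.array([v], type=pa.large_string())
        )
        i += 1

    for k, v in {"engine": "local"}.items():
        data_table = data_table.add_column(
            i, f"__execution_{k}", pa.array([v], type=pa.large_string())
        )
        i += 1

    # inserted columns sit ahead of the original "result" column, in order
    assert data_table.column_names == [
        "__variation_image",
        "__variation_seed",
        "__execution_engine",
        "result",
    ]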