Skip to content

Commit e7ff695

Browse files
authored
Merge pull request #42 from tower/feature/tow-382-add-new-operations-to-the-table-interface
Add additional methods to the Table function
2 parents 76d7631 + 4796836 commit e7ff695

10 files changed

Lines changed: 637 additions & 31 deletions

File tree

.github/workflows/test-python.yml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,21 @@ jobs:
2323
strategy:
2424
fail-fast: false
2525
matrix:
26-
os: [ubuntu-latest, windows-latest]
26+
os: ubuntu-latest
2727

2828
steps:
2929
- uses: actions/checkout@v4
3030

31-
- name: Set up Python
32-
uses: actions/setup-python@v5
33-
3431
- name: Install the latest version of uv
3532
uses: astral-sh/setup-uv@v6
3633

37-
- name: Install dependencies
38-
if: github.ref_name != 'main'
39-
run: uv sync --all-extras
34+
- name: "Set up Python"
35+
uses: actions/setup-python@v5
36+
with:
37+
python-version-file: ".python-version"
38+
39+
- name: Install the project
40+
run: uv sync --locked --all-extras --dev
4041

4142
- name: Run tests
42-
if: github.ref_name != 'main'
43-
run: uv run -m pytest --tb=short --disable-warnings
43+
run: uv run pytest tests

.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.12

pyproject.toml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,6 @@ dependencies = [
4545
ai = ["huggingface-hub>=0.30.2", "ollama>=0.4.7"]
4646
iceberg = ["polars>=1.27.1", "pyarrow>=19.0.1", "pyiceberg>=0.9.0"]
4747
all = ["tower[ai,iceberg]"]
48-
dev = [
49-
"openapi-python-client>=0.12.1",
50-
"pytest>=8.3.5",
51-
"pytest-httpx>=0.35.0",
52-
]
5348

5449
[tool.maturin]
5550
bindings = "bin"
@@ -62,3 +57,11 @@ include = ["rust-toolchain.toml"]
6257

6358
[tool.uv.sources]
6459
tower = { workspace = true }
60+
61+
[dependency-groups]
62+
dev = [
63+
"openapi-python-client>=0.12.1",
64+
"pytest>=8.3.5",
65+
"pytest-httpx>=0.35.0",
66+
"pyiceberg[sql-sqlite]>=0.9.0",
67+
]

src/tower/_tables.py

Lines changed: 139 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,46 @@
1-
from typing import Optional
1+
from typing import Optional, Generic, TypeVar, Union, List
2+
from dataclasses import dataclass
3+
4+
TTable = TypeVar("TTable", bound="Table")
25

36
import polars as pl
47
import pyarrow as pa
8+
import pyarrow.compute as pc
59

6-
from pyiceberg.catalog import load_catalog
710
from pyiceberg.table import Table as IcebergTable
11+
from pyiceberg.catalog import (
12+
Catalog,
13+
load_catalog,
14+
)
815

916
from ._context import TowerContext
10-
from .utils.pyarrow import convert_pyarrow_schema
17+
from .utils.pyarrow import (
18+
convert_pyarrow_schema,
19+
convert_pyarrow_expressions,
20+
)
1121
from .utils.tables import (
1222
make_table_name,
1323
namespace_or_default,
1424
)
1525

26+
@dataclass
class RowsAffectedInformation:
    """Running totals of the rows written through a `Table` wrapper."""

    # Number of rows inserted.
    inserts: int
    # Number of rows updated.
    updates: int
30+
31+
1632
class Table:
1733
"""
1834
`Table` is a wrapper around an Iceberg table. It provides methods to read and
1935
write data to the table.
2036
"""
2137

2238
def __init__(self, context: TowerContext, table: IcebergTable):
    """
    Wraps an Iceberg table together with the context it was loaded in.

    Args:
        context (TowerContext): The context this table belongs to.
        table (IcebergTable): The underlying Iceberg table to wrap.
    """
    self._context = context
    self._table = table

    # Every wrapper starts with zeroed counters; they only reflect writes
    # made through this particular object.
    self._stats = RowsAffectedInformation(inserts=0, updates=0)
2542

43+
2644
def read(self) -> pl.DataFrame:
2745
"""
2846
Reads from the Iceberg tables. Returns the results as a Polars DataFrame.
@@ -31,28 +49,135 @@ def read(self) -> pl.DataFrame:
3149
# the result as a DataFrame.
3250
return pl.scan_iceberg(self._table).collect()
3351

34-
def insert(self, data: pa.Table):
52+
53+
def to_polars(self) -> pl.LazyFrame:
    """
    Exposes the table as a Polars LazyFrame.

    Use this when you are comfortable with Polars and want to build a more
    involved query than `read` offers.

    Returns:
        pl.LazyFrame: A lazy scan over the underlying Iceberg table.
    """
    lazy_frame = pl.scan_iceberg(self._table)
    return lazy_frame
59+
60+
61+
def rows_affected(self) -> RowsAffectedInformation:
    """
    Returns the stats accumulated by this table wrapper. This includes the
    number of inserts and updates. Deletes are not tracked.

    Returns:
        RowsAffectedInformation: The stats object held by this wrapper (the
            same object on every call, not a copy).
    """
    return self._stats
67+
68+
69+
def insert(self: TTable, data: pa.Table) -> TTable:
    """
    Inserts data into the Iceberg table. The data is expressed as a PyArrow table.

    Args:
        data (pa.Table): The data to insert into the table.

    Returns:
        TTable: This same table object, so calls can be chained.
    """
    # `self` is annotated with TTable so the TypeVar in the return type is
    # actually bound; a TypeVar that appears only in the return position is
    # rejected by type checkers.
    self._table.append(data)

    # Count every row of `data` towards the insert total.
    self._stats.inserts += data.num_rows
    return self
82+
83+
84+
def upsert(self: TTable, data: pa.Table, join_cols: Optional[list[str]] = None) -> TTable:
    """
    Upserts data into the Iceberg table. The data is expressed as a PyArrow table.

    Args:
        data (pa.Table): The data to upsert into the table.
        join_cols (Optional[list[str]]): The columns that form the key to match rows on

    Returns:
        TTable: This same table object, so calls can be chained.
    """
    # `self` is annotated with TTable so the TypeVar in the return type is
    # actually bound; a TypeVar that appears only in the return position is
    # rejected by type checkers.
    res = self._table.upsert(
        data,
        join_cols=join_cols,

        # All upserts will always be case sensitive. Perhaps we'll add this
        # as a parameter in the future?
        case_sensitive=True,

        # These are the defaults, but we're including them to be complete.
        when_matched_update_all=True,
        when_not_matched_insert_all=True,
    )

    # Update the stats with the results of the relevant upsert.
    self._stats.updates += res.rows_updated
    self._stats.inserts += res.rows_inserted

    return self
113+
114+
115+
def delete(self: TTable, filters: Union[str, List[pc.Expression]]) -> TTable:
    """
    Deletes data from the Iceberg table. The filters are expressed either
    as a string or as a list of PyArrow expressions. The filters are
    applied to the table and the matching rows are deleted.

    Args:
        filters (Union[str, List[pc.Expression]]): The filters to apply to the table.
            This can be a string or a list of PyArrow expressions. A string is
            passed to the Iceberg table unchanged.

    Returns:
        TTable: This same table object, so calls can be chained.
    """
    # `self` is annotated with TTable so the TypeVar in the return type is
    # actually bound; a TypeVar that appears only in the return position is
    # rejected by type checkers.
    if isinstance(filters, list):
        # We need to convert the pc.Expression into PyIceberg
        filters = convert_pyarrow_expressions(filters)

    self._table.delete(
        delete_filter=filters,

        # We want this to always be the case. Not sure why you wouldn't?
        case_sensitive=True,
    )

    # NOTE: There is, unfortunately, no way to get the number of rows
    # deleted besides comparing the two snapshots that were created, so the
    # stats are left untouched here.

    return self
144+
145+
146+
def schema(self) -> pa.Schema:
    """
    Returns the table's schema as a PyArrow schema.

    Returns:
        pa.Schema: The Iceberg table schema converted with `as_arrow()`.
    """
    # The underlying table hands back an Iceberg schema; callers of this
    # wrapper work with PyArrow, so convert before returning.
    return self._table.schema().as_arrow()
150+
151+
152+
def column(self, name: str) -> pa.compute.Expression:
    """
    Returns a column from the table. This is useful when you want to
    perform some operations on the column.

    Args:
        name (str): The name of the column to reference.

    Returns:
        pa.compute.Expression: A field expression referring to the column.

    Raises:
        ValueError: If the column is not present in the table schema.
    """
    # `pa.Schema.field` signals an unknown name by raising KeyError; it never
    # returns None. Translate that into the ValueError this method promises.
    try:
        self.schema().field(name)
    except KeyError as err:
        raise ValueError(f"Column {name} not found in table schema") from err

    # The schema lookup above is only a validation step; the expression
    # itself is built from the name alone.
    return pa.compute.field(name)
164+
42165

43166
class TableReference:
44-
def __init__(self, ctx: TowerContext, catalog_name: str, name: str, namespace: Optional[str] = None):
167+
def __init__(self, ctx: TowerContext, catalog: Catalog, name: str, namespace: Optional[str] = None):
45168
self._context = ctx
46-
self._catalog = load_catalog(catalog_name)
169+
self._catalog = catalog
47170
self._name = name
48171
self._namespace = namespace
49172

173+
50174
def load(self) -> Table:
    """
    Loads the referenced table from the catalog.

    Returns:
        Table: A wrapper around the table loaded from the catalog.
    """
    # Resolve the namespace first, then build the name the catalog is
    # queried with.
    qualified_name = make_table_name(
        self._name,
        namespace_or_default(self._namespace),
    )
    return Table(self._context, self._catalog.load_table(qualified_name))
55179

180+
56181
def create(self, schema: pa.Schema) -> Table:
57182
namespace = namespace_or_default(self._namespace)
58183
table_name = make_table_name(self._name, namespace)
@@ -71,6 +196,7 @@ def create(self, schema: pa.Schema) -> Table:
71196

72197
return Table(self._context, table)
73198

199+
74200
def create_if_not_exists(self, schema: pa.Schema) -> Table:
75201
namespace = namespace_or_default(self._namespace)
76202
table_name = make_table_name(self._name, namespace)
@@ -92,7 +218,7 @@ def create_if_not_exists(self, schema: pa.Schema) -> Table:
92218

93219
def tables(
94220
name: str,
95-
catalog: str = "default",
221+
catalog: Union[str, Catalog] = "default",
96222
namespace: Optional[str] = None
97223
) -> TableReference:
98224
"""
@@ -101,11 +227,16 @@ def tables(
101227
102228
Args:
103229
`name` (str): The name of the table to load.
104-
`catalog` (str): The name of the catalog to use. "default" by default.
230+
`catalog` (Union[str, Catalog]): The name of the catalog or the actual
231+
catalog to use. "default" is the default value. You can pass in an
232+
actual catalog object for testing purposes.
105233
`namespace` (Optional[str]): The namespace in which to load the table.
106234
107235
Returns:
108236
TableReference: A reference to a table to be resolved with `create` or `load`
109237
"""
238+
if isinstance(catalog, str):
239+
catalog = load_catalog(catalog)
240+
110241
ctx = TowerContext.build()
111242
return TableReference(ctx, catalog, name, namespace)

src/tower/polars.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
try:
    import polars as _polars
    # Re-export the public polars names so they can be imported from this
    # module as well.
    from polars import *
except ImportError:
    # polars could not be imported: remember that, and leave this module
    # without any of the re-exported names.
    _polars = None

src/tower/pyarrow.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
try:
    import pyarrow as _pyarrow
    # Re-export the public pyarrow names so they can be imported from this
    # module as well.
    from pyarrow import *
except ImportError:
    # pyarrow could not be imported: remember that, and leave this module
    # without any of the re-exported names.
    _pyarrow = None

src/tower/pyiceberg.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
try:
    import pyiceberg as _pyiceberg
    # Re-export everything
    from pyiceberg import *
except ImportError:
    # pyiceberg could not be imported: remember that so attribute access
    # below can fail with a clear message.
    _pyiceberg = None


# Dynamic dispatch for submodules, as relevant.
def __getattr__(name):
    """
    Forward attribute access to the original module.

    Called only for names that are not already defined in this module.

    Raises:
        AttributeError: If pyiceberg is not installed, or if pyiceberg itself
            has no such attribute. AttributeError (rather than ImportError) is
            kept so `hasattr` and `from ... import *` keep working.
    """
    if _pyiceberg is None:
        # Without this guard the lookup would run against None and report a
        # confusing "'NoneType' object has no attribute ..." error.
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}: the optional "
            "'pyiceberg' dependency is not installed (see the 'iceberg' extra)"
        )
    return getattr(_pyiceberg, name)


# Module-level __getattr__/__dir__ require Python 3.7+.
def __dir__():
    """List pyiceberg's names, or this module's own when it is unavailable."""
    if _pyiceberg is None:
        return sorted(globals())
    return dir(_pyiceberg)

0 commit comments

Comments
 (0)