Skip to content

Commit d69f359

Browse files
committed
add example notebooks
1 parent 375085b commit d69f359

2 files changed

Lines changed: 488 additions & 0 deletions

File tree

notebooks/pyiceberg_example.ipynb

Lines changed: 364 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,364 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "c6cc20c0",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"# Import required libraries\n",
11+
"import pyiceberg\n",
12+
"from pyiceberg.catalog import load_catalog\n",
13+
"from pyiceberg.table import Table\n",
14+
"import pandas as pd\n",
15+
"import pyarrow as pa\n",
16+
"print(f\"PyIceberg version: {pyiceberg.__version__}\")"
17+
]
18+
},
19+
{
20+
"cell_type": "markdown",
21+
"id": "33dcfca9",
22+
"metadata": {},
23+
"source": [
24+
"## Setup: Connecting to a Catalog\n",
25+
"\n",
26+
"Iceberg uses a catalog to organize tables. For this example, we'll use a `SqlCatalog` with SQLite for local testing."
27+
]
28+
},
29+
{
30+
"cell_type": "code",
31+
"execution_count": null,
32+
"id": "649053ed",
33+
"metadata": {},
34+
"outputs": [],
35+
"source": [
36+
"# Import required libraries\n",
37+
"from pyiceberg.catalog import load_catalog\n",
38+
"import pyarrow.parquet as pq\n",
39+
"import pyarrow.compute as pc\n",
40+
"import tempfile\n",
41+
"import os"
42+
]
43+
},
44+
{
45+
"cell_type": "code",
46+
"execution_count": null,
47+
"id": "72d5da19",
48+
"metadata": {},
49+
"outputs": [],
50+
"source": [
51+
"# Create a temporary warehouse location\n",
52+
"warehouse_path = tempfile.mkdtemp(prefix=\"iceberg_warehouse_\")\n",
53+
"print(f\"Warehouse location: {warehouse_path}\")"
54+
]
55+
},
56+
{
57+
"cell_type": "code",
58+
"execution_count": null,
59+
"id": "14e9429c",
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"# Configure and load the catalog\n",
64+
"catalog = load_catalog(\n",
65+
" \"default\",\n",
66+
" **{\n",
67+
" 'type': 'sql',\n",
68+
" \"uri\": f\"sqlite:///{warehouse_path}/pyiceberg_catalog.db\",\n",
69+
" \"warehouse\": f\"file://{warehouse_path}\",\n",
70+
" },\n",
71+
")\n",
72+
"\n",
73+
"print(\"Catalog loaded successfully!\")\n",
74+
"print(f\"Namespaces: {list(catalog.list_namespaces())}\")"
75+
]
76+
},
77+
{
78+
"cell_type": "markdown",
79+
"id": "e330d377",
80+
"metadata": {},
81+
"source": [
82+
"## Create a Namespace and Table\n",
83+
"\n",
84+
"Let's create a namespace and a simple Iceberg table."
85+
]
86+
},
87+
{
88+
"cell_type": "code",
89+
"execution_count": null,
90+
"id": "90312e03",
91+
"metadata": {},
92+
"outputs": [],
93+
"source": [
94+
"# Create a namespace\n",
95+
"catalog.create_namespace(\"default\")\n",
96+
"print(f\"Available namespaces: {list(catalog.list_namespaces())}\")"
97+
]
98+
},
99+
{
100+
"cell_type": "markdown",
101+
"id": "f96438ef",
102+
"metadata": {},
103+
"source": [
104+
"## Write Data to an Iceberg Table\n",
105+
"\n",
106+
"We'll create a sample dataset and write it to an Iceberg table."
107+
]
108+
},
109+
{
110+
"cell_type": "code",
111+
"execution_count": null,
112+
"id": "2ef11eb9",
113+
"metadata": {},
114+
"outputs": [],
115+
"source": [
116+
"# Create sample data using PyArrow\n",
117+
"import pyarrow as pa\n",
118+
"\n",
119+
"# Sample taxi-like data\n",
120+
"data = {\n",
121+
" 'vendor_id': [1, 2, 1, 2, 1],\n",
122+
" 'trip_distance': [1.5, 2.3, 0.8, 5.2, 3.1],\n",
123+
" 'fare_amount': [10.0, 15.5, 6.0, 22.0, 18.0],\n",
124+
" 'tip_amount': [2.0, 3.0, 1.0, 4.5, 3.5],\n",
125+
" 'passenger_count': [1, 2, 1, 3, 2]\n",
126+
"}\n",
127+
"\n",
128+
"df = pa.table(data)\n",
129+
"print(\"Sample data:\")\n",
130+
"print(df)"
131+
]
132+
},
133+
{
134+
"cell_type": "code",
135+
"execution_count": null,
136+
"id": "678d122a",
137+
"metadata": {},
138+
"outputs": [],
139+
"source": [
140+
"# Create an Iceberg table with the schema from our dataframe\n",
141+
"table = catalog.create_table(\n",
142+
" \"default.sample_trips\",\n",
143+
" schema=df.schema,\n",
144+
")\n",
145+
"\n",
146+
"print(f\"Created table: {table}\")\n",
147+
"print(f\"Table schema: {table.schema()}\")"
148+
]
149+
},
150+
{
151+
"cell_type": "code",
152+
"execution_count": null,
153+
"id": "8e135b2a",
154+
"metadata": {},
155+
"outputs": [],
156+
"source": [
157+
"# Append data to the table\n",
158+
"table.append(df)\n",
159+
"print(f\"Rows written: {len(table.scan().to_arrow())}\")"
160+
]
161+
},
162+
{
163+
"cell_type": "markdown",
164+
"id": "0ef43fbf",
165+
"metadata": {},
166+
"source": [
167+
"## Read Data from the Table\n",
168+
"\n",
169+
"Let's read back the data we just wrote."
170+
]
171+
},
172+
{
173+
"cell_type": "code",
174+
"execution_count": null,
175+
"id": "d1ef0396",
176+
"metadata": {},
177+
"outputs": [],
178+
"source": [
179+
"# Scan and read the entire table\n",
180+
"result = table.scan().to_arrow()\n",
181+
"print(\"Table contents:\")\n",
182+
"print(result)"
183+
]
184+
},
185+
{
186+
"cell_type": "markdown",
187+
"id": "a8a3c906",
188+
"metadata": {},
189+
"source": [
190+
"## Schema Evolution\n",
191+
"\n",
192+
"One of Iceberg's powerful features is schema evolution. Let's add a new computed column."
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"execution_count": null,
198+
"id": "8725f3ce",
199+
"metadata": {},
200+
"outputs": [],
201+
"source": [
202+
"# Add a new computed column: tip per mile\n",
203+
"df = df.append_column(\"tip_per_mile\", pc.divide(df[\"tip_amount\"], df[\"trip_distance\"]))\n",
204+
"print(\"Updated dataframe with new column:\")\n",
205+
"print(df)"
206+
]
207+
},
208+
{
209+
"cell_type": "code",
210+
"execution_count": null,
211+
"id": "e0c4550b",
212+
"metadata": {},
213+
"outputs": [],
214+
"source": [
215+
"# Evolve the table schema to include the new column\n",
216+
"with table.update_schema() as update_schema:\n",
217+
" update_schema.union_by_name(df.schema)\n",
218+
"\n",
219+
"print(\"Schema evolved!\")\n",
220+
"print(f\"Updated table schema: {table.schema()}\")"
221+
]
222+
},
223+
{
224+
"cell_type": "code",
225+
"execution_count": null,
226+
"id": "2a65eee4",
227+
"metadata": {},
228+
"outputs": [],
229+
"source": [
230+
"# Overwrite the table with the new data\n",
231+
"table.overwrite(df)\n",
232+
"print(\"Data overwritten with new schema\")\n",
233+
"\n",
234+
"# Verify the new column exists\n",
235+
"result = table.scan().to_arrow()\n",
236+
"print(result)"
237+
]
238+
},
239+
{
240+
"cell_type": "markdown",
241+
"id": "7140ba0c",
242+
"metadata": {},
243+
"source": [
244+
"## Filtering Data\n",
245+
"\n",
246+
"PyIceberg supports predicate pushdown for efficient data filtering."
247+
]
248+
},
249+
{
250+
"cell_type": "code",
251+
"execution_count": null,
252+
"id": "3af0f0b1",
253+
"metadata": {},
254+
"outputs": [],
255+
"source": [
256+
"# Filter rows where tip_per_mile > 1.0\n",
257+
"filtered_df = table.scan(row_filter=\"tip_per_mile > 1.0\").to_arrow()\n",
258+
"print(f\"Rows with tip_per_mile > 1.0: {len(filtered_df)}\")\n",
259+
"print(filtered_df)"
260+
]
261+
},
262+
{
263+
"cell_type": "markdown",
264+
"id": "ff173f80",
265+
"metadata": {},
266+
"source": [
267+
"## Inspect Table Metadata\n",
268+
"\n",
269+
"Iceberg tables maintain rich metadata about their structure and history."
270+
]
271+
},
272+
{
273+
"cell_type": "code",
274+
"execution_count": null,
275+
"id": "e3763e27",
276+
"metadata": {},
277+
"outputs": [],
278+
"source": [
279+
"# View table properties\n",
280+
"print(f\"Table location: {table.location()}\")\n",
281+
"print(f\"Table properties: {table.properties}\")\n",
282+
"print(f\"Current snapshot ID: {table.current_snapshot()}\")"
283+
]
284+
},
285+
{
286+
"cell_type": "code",
287+
"execution_count": null,
288+
"id": "49154477",
289+
"metadata": {},
290+
"outputs": [],
291+
"source": [
292+
"# View table history (snapshots)\n",
293+
"print(\"Table history:\")\n",
294+
"for snapshot in table.history():\n",
295+
" print(f\" Snapshot: {snapshot}\")"
296+
]
297+
},
298+
{
299+
"cell_type": "markdown",
300+
"id": "448a1962",
301+
"metadata": {},
302+
"source": [
303+
"## Explore Data Files\n",
304+
"\n",
305+
"Let's see what files Iceberg created in the warehouse."
306+
]
307+
},
308+
{
309+
"cell_type": "code",
310+
"execution_count": null,
311+
"id": "3c8948e5",
312+
"metadata": {},
313+
"outputs": [],
314+
"source": [
315+
"# List all files in the warehouse\n",
316+
"import os\n",
317+
"for root, dirs, files in os.walk(warehouse_path):\n",
318+
" level = root.replace(warehouse_path, '').count(os.sep)\n",
319+
" indent = ' ' * 2 * level\n",
320+
" print(f'{indent}{os.path.basename(root)}/')\n",
321+
" subindent = ' ' * 2 * (level + 1)\n",
322+
" for file in files:\n",
323+
" print(f'{subindent}{file}')"
324+
]
325+
},
326+
{
327+
"cell_type": "markdown",
328+
"id": "e9db29ad",
329+
"metadata": {},
330+
"source": [
331+
"## Additional Operations\n",
332+
"\n",
333+
"PyIceberg supports many more operations including:\n",
334+
"- Time travel queries\n",
335+
"- Partition evolution\n",
336+
"- Table maintenance (expire snapshots, rewrite data files)\n",
337+
"- Integration with pandas, DuckDB, Ray, and more\n",
338+
"\n",
339+
"Check the [PyIceberg documentation](https://py.iceberg.apache.org/) for more details!"
340+
]
341+
}
342+
],
343+
"metadata": {
344+
"kernelspec": {
345+
"display_name": "Python 3 (ipykernel)",
346+
"language": "python",
347+
"name": "python3"
348+
},
349+
"language_info": {
350+
"codemirror_mode": {
351+
"name": "ipython",
352+
"version": 3
353+
},
354+
"file_extension": ".py",
355+
"mimetype": "text/x-python",
356+
"name": "python",
357+
"nbconvert_exporter": "python",
358+
"pygments_lexer": "ipython3",
359+
"version": "3.12.9"
360+
}
361+
},
362+
"nbformat": 4,
363+
"nbformat_minor": 5
364+
}

0 commit comments

Comments
 (0)