Commit 133dfd9
Merge branch 'main' into dl_branch
Resolve conflict in integration_tests.py: accept main's streamlined test structure and add branch integration tests (create branch_a at snap1, branch_b at snap2; verify data isolation, snapshot resolution, and non-existent branch error).
1 parent 4ad1346 commit 133dfd9
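The branch tests described in the merge message can be pictured with a rough sketch. The `run_sql` helper and the `db.tbl` table name below are illustrative stand-ins (the actual tests drive Spark SQL through Livy and read results back through the dataloader); the branch DDL and metadata queries are standard Iceberg Spark SQL:

```python
# Illustrative sketch only: run_sql is a hypothetical helper that submits a
# Spark SQL statement (e.g. via Livy) and returns rows as a list of tuples.
def check_branch_behavior(run_sql, table="db.tbl"):
    # Resolve the first two snapshot IDs of the table.
    snaps = run_sql(f"SELECT snapshot_id FROM {table}.snapshots ORDER BY committed_at")
    snap1, snap2 = snaps[0][0], snaps[1][0]

    # Create one branch at each snapshot (Iceberg Spark SQL extensions).
    run_sql(f"ALTER TABLE {table} CREATE BRANCH branch_a AS OF VERSION {snap1}")
    run_sql(f"ALTER TABLE {table} CREATE BRANCH branch_b AS OF VERSION {snap2}")

    # Data isolation and snapshot resolution: each branch only sees the rows
    # that existed at the snapshot it was created from.
    rows_a = run_sql(f"SELECT count(*) FROM {table}.branch_branch_a")[0][0]
    rows_b = run_sql(f"SELECT count(*) FROM {table}.branch_branch_b")[0][0]
    assert rows_a < rows_b  # assuming snap2 appended rows after snap1

    # Reading a non-existent branch should surface an error.
    try:
        run_sql(f"SELECT * FROM {table}.branch_does_not_exist")
    except Exception:
        pass
    else:
        raise AssertionError("expected an error for a non-existent branch")
```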

18 files changed

Lines changed: 659 additions & 587 deletions

.github/workflows/build-run-tests.yml

Lines changed: 0 additions & 13 deletions
@@ -34,19 +34,6 @@ jobs:
         with:
           python-version: '3.12'
 
-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
-        with:
-          enable-cache: true
-
-      - name: Run Data Loader Tests
-        working-directory: integrations/python/dataloader
-        run: make sync verify
-
-      - name: Run Data Loader Integration Tests
-        working-directory: integrations/python/dataloader
-        run: make integration-tests TOKEN_FILE=../../../tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/resources/dummy.token
-
       - name: Install dependencies
         run: pip install -r scripts/python/requirements.txt
 

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+name: Dataloader Tests
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches:
+      - main
+    paths:
+      - 'integrations/python/dataloader/**'
+
+concurrency:
+  group: dataloader-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    name: Dataloader Build and Test
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout project sources
+        uses: actions/checkout@v6
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.12'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+
+      - name: Run Data Loader Tests
+        working-directory: integrations/python/dataloader
+        run: make sync verify
+
+      - name: Set up JDK 17
+        uses: actions/setup-java@v5
+        with:
+          distribution: 'microsoft'
+          java-version: '17'
+
+      - name: Setup Gradle
+        uses: gradle/actions/setup-gradle@v5
+
+      - name: Build project and Spark uber JARs
+        run: ./gradlew clean build shadowJar -x test
+
+      - name: Start Docker Containers
+        run: docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml up -d --build
+
+      - name: Wait for Docker Containers to be ready
+        run: |
+          echo "Waiting for openhouse-tables and spark-livy..."
+          for i in $(seq 1 120); do
+            tables_status=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8000/v1/databases 2>&1 || echo "000")
+            livy_status=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:9003/sessions 2>&1 || echo "000")
+            echo "Poll $i: tables=$tables_status livy=$livy_status"
+            if [ "$tables_status" != "000" ] && [ "$livy_status" != "000" ]; then
+              echo "Services ready after $((i * 5))s"
+              exit 0
+            fi
+            sleep 5
+          done
+          echo "Timed out waiting for services after 600s"
+          docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml ps
+          exit 1
+
+      - name: Run Data Loader Integration Tests
+        working-directory: integrations/python/dataloader
+        run: make integration-tests TOKEN_FILE=../../../tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/resources/dummy.token
+
+      - name: Stop Docker Containers
+        if: always()
+        run: docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml down

apps/spark-3.5/src/test/java/com/linkedin/openhouse/catalog/e2e/SparkMoRFunctionalTest.java

Lines changed: 87 additions & 0 deletions
@@ -26,6 +26,7 @@
 import org.apache.iceberg.io.OutputFileFactory;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.actions.SparkActions;
 import org.apache.iceberg.util.ArrayUtil;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
@@ -195,6 +196,92 @@ public void testCompactionCanRemoveEqualityDeleteFiles() throws NoSuchTableExcep
     assertThat(stats.getNumCurrentSnapshotEqualityDeleteFiles()).isEqualTo(0L);
   }
 
+  @Test
+  public void testBudgetedRewriteUsesDataLengthForTaskGrouping() throws NoSuchTableException {
+    createAndInitTable("id int, data string");
+
+    // Create 4 separate data files by appending individually
+    for (int i = 0; i < 4; i++) {
+      List<SimpleRecord> records =
+          Arrays.asList(
+              new SimpleRecord(i * 2, "data_" + i), new SimpleRecord(i * 2 + 1, "data_" + i));
+      ops.spark()
+          .createDataset(records, Encoders.bean(SimpleRecord.class))
+          .coalesce(1)
+          .writeTo(tableName)
+          .append();
+    }
+
+    // Delete one row from each data file to produce partition-scoped position delete files.
+    // In an unpartitioned table, all position deletes are in the same partition and thus
+    // associated with ALL data files, inflating each task's sizeBytes relative to its length.
+    for (int i = 0; i < 4; i++) {
+      sql("DELETE FROM %s WHERE id = %d", tableName, i * 2);
+    }
+
+    Table table = ops.getTable(tableName);
+
+    // Verify we have 4 data files and position delete files
+    List<Object[]> dataFileCountResult = sql("SELECT count(*) FROM %s.data_files", tableName);
+    assertThat((long) dataFileCountResult.get(0)[0]).isEqualTo(4L);
+
+    List<Object[]> deleteFileCountResult = sql("SELECT count(*) FROM %s.delete_files", tableName);
+    assertThat((long) deleteFileCountResult.get(0)[0]).isGreaterThanOrEqualTo(4L);
+
+    // Compute budget as half of total data file size (by file_size_in_bytes from metadata,
+    // excluding delete file sizes). If the old sizeBytes-based grouping was used, each task
+    // would appear much larger (data + all partition-scoped delete files), and the budget
+    // would cover fewer files.
+    List<Object[]> totalSizeResult =
+        sql("SELECT sum(file_size_in_bytes) FROM %s.data_files", tableName);
+    long totalDataSize = (long) totalSizeResult.get(0)[0];
+    // add margin to total data size, file sizes are roughly the same but can vary by a few bytes
+    long margin = totalDataSize / 10;
+    long halfBudget = totalDataSize / 2 + margin;
+
+    // Set target-file-size-bytes to the total size of 2 data files. With the length-based
+    // grouping fix (linkedin/iceberg#233), the 2 rewritten data files are grouped into a
+    // single task and merged into 1 output file. If sizeBytes (data + all partition-scoped
+    // delete files) was used instead, each task would appear much larger than the target,
+    // preventing them from being grouped together and producing 2 separate output files.
+    long targetSize = halfBudget;
+
+    log.info(
+        "Budgeted rewrite test: totalDataSize={}, halfBudget={}, targetSize={}",
+        totalDataSize,
+        halfBudget,
+        targetSize);
+
+    // Use SparkActions directly instead of ops.rewriteDataFiles() because this test requires
+    // fine-grained control over budget options (MAX_TOTAL_FILES_SIZE_BYTES, target-file-size-bytes)
+    // that are not exposed through the Operations API.
+    RewriteDataFiles.Result result =
+        SparkActions.get(ops.spark())
+            .rewriteDataFiles(table)
+            .binPack()
+            .option(RewriteDataFiles.MAX_TOTAL_FILES_SIZE_BYTES, Long.toString(halfBudget))
+            .option("target-file-size-bytes", Long.toString(targetSize))
+            .option("min-file-size-bytes", "1")
+            .option("max-file-size-bytes", Long.toString(targetSize * 2))
+            .option("min-input-files", "1")
+            .option("delete-file-threshold", "0")
+            .execute();
+
+    // Budget covers exactly half the data files by length.
+    Assertions.assertEquals(2, result.rewrittenDataFilesCount());
+    // With length-based grouping, the 2 data files (total size = targetSize) fit in one group
+    // and merge into 1 output file. With sizeBytes-based grouping, each file's perceived size
+    // would be data_length + totalDeleteSize, far exceeding the target, so they would be
+    // placed in separate groups producing 2 output files instead.
+    Assertions.assertEquals(1, result.addedDataFilesCount());
+
+    // Verify data correctness: only odd-numbered IDs remain (even IDs were deleted)
+    List<Object[]> expected =
+        Arrays.asList(row(1, "data_0"), row(3, "data_1"), row(5, "data_2"), row(7, "data_3"));
+    List<Object[]> actual = sql("SELECT * FROM %s ORDER BY id ASC", tableName);
+    assertThat(actual).containsExactlyElementsOf(expected);
+  }
+
   private void writeEqDeleteRecord(Table table, String delCol, Object delVal) {
     List<Integer> equalityFieldIds = Lists.newArrayList(table.schema().findField(delCol).fieldId());
     Schema eqDeleteRowSchema = table.schema().select(delCol);
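The comments in the new test argue that grouping by data length, rather than by sizeBytes, changes both how many files the rewrite budget covers and whether rewritten files can be merged under the target size. A back-of-the-envelope sketch with made-up file sizes (the real values come from the table metadata at run time) shows the effect:

```python
# Illustrative numbers only: 4 data files of ~1 KB each, plus 4 position-delete
# files of ~1.5 KB each that, in an unpartitioned table, are attached to every task.
data_len = 1_000
delete_total = 4 * 1_500

total_data = 4 * data_len                    # 4,000 bytes of data
budget = total_data // 2 + total_data // 10  # half plus 10% margin = 2,400
target = budget

# Length-based accounting (the fix): each task is ~1,000 bytes, so the budget
# covers 2 files and both fit into a single output group under the target.
assert 2 * data_len <= target

# sizeBytes-based accounting (the old behavior): each task appears to be
# data + all partition-scoped deletes = 7,000 bytes, far above the target,
# so no two files would ever be grouped into the same output file.
assert data_len + delete_total > target
```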

build.gradle

Lines changed: 2 additions & 2 deletions
@@ -30,8 +30,8 @@ ext {
   spark_version = "3.1.1"
   ok_http3_version = "4.11.0"
   junit_version = "5.11.0"
-  iceberg_1_2_version = "1.2.0.6"
-  iceberg_1_5_version = "1.5.2.7"
+  iceberg_1_2_version = "1.2.0.11"
+  iceberg_1_5_version = "1.5.2.8"
   otel_agent_version = "2.12.0" // Bundles OTel SDK 1.47.0
   otel_annotations_version = "2.12.0" // Match agent version
 }

integrations/python/dataloader/CLAUDE.md

Lines changed: 7 additions & 4 deletions
@@ -25,24 +25,27 @@ When validating a change, always run both:
 1. `make verify` — lint, format checks, and unit tests
 2. Integration tests against Docker OpenHouse — start the Docker services, then run `make integration-tests`. These test the dataloader end-to-end against a real OpenHouse instance and must pass before a change is considered correct.
 
+Run `make format` before pushing to avoid CI formatting failures.
+
 ```bash
 # From the repo root, start Docker services (once per session):
-docker compose -f infra/recipes/docker-compose/oh-only/docker-compose.yml up -d
+docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml up -d
 
 # From the dataloader directory:
+make format
 make verify
 make integration-tests TOKEN_FILE=../../../tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/resources/dummy.token
 ```
 
 ## Integration Tests
 
-Integration tests run against an OpenHouse instance in Docker. To run them:
+Integration tests run inside a Docker container on the same network as the oh-hadoop-spark services. The `make integration-tests` target builds a test image and runs it automatically. Tables are created and populated via Spark SQL submitted through Livy.
 
 1. Start the Docker services from the repo root:
    ```bash
-   docker compose -f infra/recipes/docker-compose/oh-only/docker-compose.yml up -d
+   docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml up -d
   ```
-2. Run the tests with the dummy token (uses `DummyTokenInterceptor`, no real auth needed):
+2. Wait for all services to be healthy (especially Livy and namenode), then run:
   ```bash
   make integration-tests TOKEN_FILE=../../../tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/resources/dummy.token
   ```
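The updated CLAUDE.md notes that tables are created and populated via Spark SQL submitted through Livy. A minimal sketch of that flow against the compose stack's Livy endpoint (port 9003, the same one the CI wait loop polls) could look like the following; the session kind, polling cadence, and example statement are assumptions, not the exact code the tests use:

```python
import time
import requests

LIVY = "http://localhost:9003"  # Livy port exposed by the oh-hadoop-spark compose stack

def run_spark_sql(code: str) -> dict:
    # Start a Spark SQL session through Livy's REST API.
    session = requests.post(f"{LIVY}/sessions", json={"kind": "sql"}).json()
    sid = session["id"]

    # Wait for the session to become idle before submitting statements.
    while requests.get(f"{LIVY}/sessions/{sid}").json()["state"] != "idle":
        time.sleep(2)

    # Submit the SQL statement and poll until it finishes.
    stmt = requests.post(f"{LIVY}/sessions/{sid}/statements", json={"code": code}).json()
    while True:
        result = requests.get(f"{LIVY}/sessions/{sid}/statements/{stmt['id']}").json()
        if result["state"] in ("available", "error"):
            return result
        time.sleep(2)

# Example: create a table the dataloader tests could then read back.
# run_spark_sql("CREATE TABLE openhouse.db.tbl (id INT, name STRING)")
```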

integrations/python/dataloader/Makefile

Lines changed: 17 additions & 3 deletions
@@ -1,4 +1,4 @@
-.PHONY: help sync clean lint format format-check typecheck check test verify build package-check integration-tests
+.PHONY: help sync clean lint format format-check typecheck check test verify build package-check build-itest integration-tests
 
 help:
 	@echo "Available commands:"
@@ -41,13 +41,27 @@ build:
 package-check:
 	uv run twine check dist/*
 
-integration-tests:
-	uv run python tests/integration_tests.py $(TOKEN_FILE)
+ITEST_IMAGE ?= oh-dataloader-itest
+DOCKER_NETWORK ?= oh-hadoop-spark_default
+ITEST_SITE = build/itest-site-packages
+
+build-itest: $(ITEST_SITE)
+
+$(ITEST_SITE): pyproject.toml uv.lock src/
+	rm -rf $(ITEST_SITE)
+	SETUPTOOLS_SCM_PRETEND_VERSION=0.0.0 uv pip install --target $(ITEST_SITE) --python-platform x86_64-unknown-linux-gnu --python-version 3.12 ".[dev]"
+
+integration-tests: $(ITEST_SITE)
+	docker build --platform linux/amd64 -t $(ITEST_IMAGE) -f tests/Dockerfile .
+	docker run --rm --platform linux/amd64 --network $(DOCKER_NETWORK) \
+		-e OH_TOKEN="$$(cat $(TOKEN_FILE))" \
+		$(ITEST_IMAGE) python3 tests/integration_tests.py
 
 clean:
 	rm -rf build/
 	rm -rf dist/
 	rm -rf *.egg-info
 	rm -rf .venv/
+	rm -rf $(ITEST_SITE)
 	find . -type d -name __pycache__ -exec rm -rf {} +
 	find . -type f -name "*.pyc" -delete

integrations/python/dataloader/src/openhouse/dataloader/data_loader_split.py

Lines changed: 9 additions & 0 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import hashlib
 from collections.abc import Iterator, Mapping
 from types import MappingProxyType
 
@@ -27,6 +28,14 @@ def __init__(
         self._udf_registry = udf_registry or NoOpRegistry()
         self._scan_context = scan_context
 
+    @property
+    def id(self) -> str:
+        """Unique ID for the split. This is stable across executions for a given
+        snapshot and split size.
+        """
+        file_path = self._file_scan_task.file.file_path
+        return hashlib.sha256(file_path.encode("utf-8")).hexdigest()
+
     @property
     def table_properties(self) -> Mapping[str, str]:
         """Properties of the table being loaded"""
integrations/python/dataloader/tests/Dockerfile

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+# Minimal runtime image with Java 8 + Hadoop 2.8 for HDFS access.
+# Dependencies are pre-installed into build/itest-site-packages on the host and copied in.
+FROM python:3.12-slim-bookworm
+
+# Copy JRE 8 and Hadoop 2.8 client from the hadoop base image.
+# JRE is needed by PyArrow's libhdfs JNI bridge to read HDFS data.
+# Hadoop 2.8 requires Java 8 — it is incompatible with Java 17 (NoClassDefFoundError).
+COPY --from=bde2020/hadoop-namenode:1.2.0-hadoop2.8-java8 /usr/lib/jvm/java-8-openjdk-amd64/jre /opt/java
+COPY --from=bde2020/hadoop-namenode:1.2.0-hadoop2.8-java8 /opt/hadoop-2.8.0 /opt/hadoop-2.8.0
+
+ENV JAVA_HOME=/opt/java
+ENV HADOOP_HOME=/opt/hadoop-2.8.0
+ENV ARROW_LIBHDFS_DIR="${HADOOP_HOME}/lib/native"
+
+# Expand CLASSPATH globs at build time. JNI does not expand '*' wildcards the
+# way the java launcher does, so we must list every jar explicitly.
+RUN echo "${HADOOP_HOME}/etc/hadoop" > /tmp/cp_parts && \
+    find ${HADOOP_HOME}/share/hadoop/common ${HADOOP_HOME}/share/hadoop/common/lib \
+         ${HADOOP_HOME}/share/hadoop/hdfs ${HADOOP_HOME}/share/hadoop/hdfs/lib \
+         -maxdepth 1 -name '*.jar' >> /tmp/cp_parts && \
+    paste -sd ':' /tmp/cp_parts > /tmp/cp && \
+    rm /tmp/cp_parts
+ENV CLASSPATH_FILE=/tmp/cp
+
+WORKDIR /app
+COPY build/itest-site-packages/ /app/site-packages/
+ENV PYTHONPATH=/app/site-packages
+COPY tests/ tests/
+
+# Set CLASSPATH from the expanded file at container start.
+ENTRYPOINT ["sh", "-c", "export CLASSPATH=$(cat /tmp/cp) && exec \"$@\"", "--"]

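For context on the new test image: PyArrow reaches HDFS through the libhdfs JNI bridge, which is why the Dockerfile bakes in JAVA_HOME, ARROW_LIBHDFS_DIR, and a fully expanded CLASSPATH. A minimal sketch of what a test process inside the container relies on; the namenode host name, port, and file path are assumptions based on the compose stack, not values taken from the tests:

```python
import os
import pyarrow.fs as pafs

# These are baked into the image; libhdfs refuses to start without them.
assert os.environ.get("JAVA_HOME") and os.environ.get("ARROW_LIBHDFS_DIR")
assert os.environ.get("CLASSPATH")  # expanded jar list, exported by the ENTRYPOINT

# Connect to the HDFS namenode on the compose network and read a data file
# (host, port, and path below are illustrative).
hdfs = pafs.HadoopFileSystem(host="namenode", port=9000)
with hdfs.open_input_file("/data/openhouse/db/tbl/data/part-00000.parquet") as f:
    print(f.size())
```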