Commit 133dfd9
Merge branch 'main' into dl_branch
Resolve conflict in integration_tests.py: accept main's streamlined test structure and add branch integration tests (create branch_a at snap1, branch_b at snap2; verify data isolation, snapshot resolution, and non-existent branch error).
1 parent 4ad1346 commit 133dfd9
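The branch tests described in the merge message can be pictured with a rough sketch. The `run_sql` helper and the `db.tbl` table name below are illustrative stand-ins (the actual tests drive Spark SQL through Livy and read results back through the dataloader); the branch DDL and metadata queries are standard Iceberg Spark SQL:

```python
# Illustrative sketch only: run_sql is a hypothetical helper that submits a
# Spark SQL statement (e.g. via Livy) and returns rows as a list of tuples.
def check_branch_behavior(run_sql, table="db.tbl"):
    # Resolve the first two snapshot IDs of the table.
    snaps = run_sql(f"SELECT snapshot_id FROM {table}.snapshots ORDER BY committed_at")
    snap1, snap2 = snaps[0][0], snaps[1][0]

    # Create one branch at each snapshot (Iceberg Spark SQL extensions).
    run_sql(f"ALTER TABLE {table} CREATE BRANCH branch_a AS OF VERSION {snap1}")
    run_sql(f"ALTER TABLE {table} CREATE BRANCH branch_b AS OF VERSION {snap2}")

    # Data isolation and snapshot resolution: each branch only sees the rows
    # that existed at the snapshot it was created from.
    rows_a = run_sql(f"SELECT count(*) FROM {table}.branch_branch_a")[0][0]
    rows_b = run_sql(f"SELECT count(*) FROM {table}.branch_branch_b")[0][0]
    assert rows_a < rows_b  # assuming snap2 appended rows after snap1

    # Reading a non-existent branch should surface an error.
    try:
        run_sql(f"SELECT * FROM {table}.branch_does_not_exist")
    except Exception:
        pass
    else:
        raise AssertionError("expected an error for a non-existent branch")
```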

18 files changed

Lines changed: 659 additions & 587 deletions

.github/workflows/build-run-tests.yml

Lines changed: 0 additions & 13 deletions
@@ -34,19 +34,6 @@ jobs:
         with:
           python-version: '3.12'
 
-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
-        with:
-          enable-cache: true
-
-      - name: Run Data Loader Tests
-        working-directory: integrations/python/dataloader
-        run: make sync verify
-
-      - name: Run Data Loader Integration Tests
-        working-directory: integrations/python/dataloader
-        run: make integration-tests TOKEN_FILE=../../../tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/resources/dummy.token
-
       - name: Install dependencies
         run: pip install -r scripts/python/requirements.txt
 

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+name: Dataloader Tests
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches:
+      - main
+    paths:
+      - 'integrations/python/dataloader/**'
+
+concurrency:
+  group: dataloader-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    name: Dataloader Build and Test
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout project sources
+        uses: actions/checkout@v6
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.12'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+
+      - name: Run Data Loader Tests
+        working-directory: integrations/python/dataloader
+        run: make sync verify
+
+      - name: Set up JDK 17
+        uses: actions/setup-java@v5
+        with:
+          distribution: 'microsoft'
+          java-version: '17'
+
+      - name: Setup Gradle
+        uses: gradle/actions/setup-gradle@v5
+
+      - name: Build project and Spark uber JARs
+        run: ./gradlew clean build shadowJar -x test
+
+      - name: Start Docker Containers
+        run: docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml up -d --build
+
+      - name: Wait for Docker Containers to be ready
+        run: |
+          echo "Waiting for openhouse-tables and spark-livy..."
+          for i in $(seq 1 120); do
+            tables_status=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8000/v1/databases 2>&1 || echo "000")
+            livy_status=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:9003/sessions 2>&1 || echo "000")
+            echo "Poll $i: tables=$tables_status livy=$livy_status"
+            if [ "$tables_status" != "000" ] && [ "$livy_status" != "000" ]; then
+              echo "Services ready after $((i * 5))s"
+              exit 0
+            fi
+            sleep 5
+          done
+          echo "Timed out waiting for services after 600s"
+          docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml ps
+          exit 1
+
+      - name: Run Data Loader Integration Tests
+        working-directory: integrations/python/dataloader
+        run: make integration-tests TOKEN_FILE=../../../tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/resources/dummy.token
+
+      - name: Stop Docker Containers
+        if: always()
+        run: docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml down

apps/spark-3.5/src/test/java/com/linkedin/openhouse/catalog/e2e/SparkMoRFunctionalTest.java

Lines changed: 87 additions & 0 deletions
@@ -26,6 +26,7 @@
 import org.apache.iceberg.io.OutputFileFactory;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.actions.SparkActions;
 import org.apache.iceberg.util.ArrayUtil;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
@@ -195,6 +196,92 @@ public void testCompactionCanRemoveEqualityDeleteFiles() throws NoSuchTableExcep
     assertThat(stats.getNumCurrentSnapshotEqualityDeleteFiles()).isEqualTo(0L);
   }
 
+  @Test
+  public void testBudgetedRewriteUsesDataLengthForTaskGrouping() throws NoSuchTableException {
+    createAndInitTable("id int, data string");
+
+    // Create 4 separate data files by appending individually
+    for (int i = 0; i < 4; i++) {
+      List<SimpleRecord> records =
+          Arrays.asList(
+              new SimpleRecord(i * 2, "data_" + i), new SimpleRecord(i * 2 + 1, "data_" + i));
+      ops.spark()
+          .createDataset(records, Encoders.bean(SimpleRecord.class))
+          .coalesce(1)
+          .writeTo(tableName)
+          .append();
+    }
+
+    // Delete one row from each data file to produce partition-scoped position delete files.
+    // In an unpartitioned table, all position deletes are in the same partition and thus
+    // associated with ALL data files, inflating each task's sizeBytes relative to its length.
+    for (int i = 0; i < 4; i++) {
+      sql("DELETE FROM %s WHERE id = %d", tableName, i * 2);
+    }
+
+    Table table = ops.getTable(tableName);
+
+    // Verify we have 4 data files and position delete files
+    List<Object[]> dataFileCountResult = sql("SELECT count(*) FROM %s.data_files", tableName);
+    assertThat((long) dataFileCountResult.get(0)[0]).isEqualTo(4L);
+
+    List<Object[]> deleteFileCountResult = sql("SELECT count(*) FROM %s.delete_files", tableName);
+    assertThat((long) deleteFileCountResult.get(0)[0]).isGreaterThanOrEqualTo(4L);
+
+    // Compute budget as half of total data file size (by file_size_in_bytes from metadata,
+    // excluding delete file sizes). If the old sizeBytes-based grouping was used, each task
+    // would appear much larger (data + all partition-scoped delete files), and the budget
+    // would cover fewer files.
+    List<Object[]> totalSizeResult =
+        sql("SELECT sum(file_size_in_bytes) FROM %s.data_files", tableName);
+    long totalDataSize = (long) totalSizeResult.get(0)[0];
+    // add margin to total data size, file sizes are roughly the same but can vary by a few bytes
+    long margin = totalDataSize / 10;
+    long halfBudget = totalDataSize / 2 + margin;
+
+    // Set target-file-size-bytes to the total size of 2 data files. With the length-based
+    // grouping fix (linkedin/iceberg#233), the 2 rewritten data files are grouped into a
+    // single task and merged into 1 output file. If sizeBytes (data + all partition-scoped
+    // delete files) was used instead, each task would appear much larger than the target,
+    // preventing them from being grouped together and producing 2 separate output files.
+    long targetSize = halfBudget;
+
+    log.info(
+        "Budgeted rewrite test: totalDataSize={}, halfBudget={}, targetSize={}",
+        totalDataSize,
+        halfBudget,
+        targetSize);
+
+    // Use SparkActions directly instead of ops.rewriteDataFiles() because this test requires
+    // fine-grained control over budget options (MAX_TOTAL_FILES_SIZE_BYTES, target-file-size-bytes)
+    // that are not exposed through the Operations API.
+    RewriteDataFiles.Result result =
+        SparkActions.get(ops.spark())
+            .rewriteDataFiles(table)
+            .binPack()
+            .option(RewriteDataFiles.MAX_TOTAL_FILES_SIZE_BYTES, Long.toString(halfBudget))
+            .option("target-file-size-bytes", Long.toString(targetSize))
+            .option("min-file-size-bytes", "1")
+            .option("max-file-size-bytes", Long.toString(targetSize * 2))
+            .option("min-input-files", "1")
+            .option("delete-file-threshold", "0")
+            .execute();
+
+    // Budget covers exactly half the data files by length.
+    Assertions.assertEquals(2, result.rewrittenDataFilesCount());
+    // With length-based grouping, the 2 data files (total size = targetSize) fit in one group
+    // and merge into 1 output file. With sizeBytes-based grouping, each file's perceived size
+    // would be data_length + totalDeleteSize, far exceeding the target, so they would be
+    // placed in separate groups producing 2 output files instead.
+    Assertions.assertEquals(1, result.addedDataFilesCount());
+
+    // Verify data correctness: only odd-numbered IDs remain (even IDs were deleted)
+    List<Object[]> expected =
+        Arrays.asList(row(1, "data_0"), row(3, "data_1"), row(5, "data_2"), row(7, "data_3"));
+    List<Object[]> actual = sql("SELECT * FROM %s ORDER BY id ASC", tableName);
+    assertThat(actual).containsExactlyElementsOf(expected);
+  }
+
   private void writeEqDeleteRecord(Table table, String delCol, Object delVal) {
     List<Integer> equalityFieldIds = Lists.newArrayList(table.schema().findField(delCol).fieldId());
     Schema eqDeleteRowSchema = table.schema().select(delCol);
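The comments in the new test argue that grouping by data length, rather than by sizeBytes, changes both how many files the rewrite budget covers and whether rewritten files can be merged under the target size. A back-of-the-envelope sketch with made-up file sizes (the real values come from the table metadata at run time) shows the effect:

```python
# Illustrative numbers only: 4 data files of ~1 KB each, plus 4 position-delete
# files of ~1.5 KB each that, in an unpartitioned table, are attached to every task.
data_len = 1_000
delete_total = 4 * 1_500

total_data = 4 * data_len                    # 4,000 bytes of data
budget = total_data // 2 + total_data // 10  # half plus 10% margin = 2,400
target = budget

# Length-based accounting (the fix): each task is ~1,000 bytes, so the budget
# covers 2 files and both fit into a single output group under the target.
assert 2 * data_len <= target

# sizeBytes-based accounting (the old behavior): each task appears to be
# data + all partition-scoped deletes = 7,000 bytes, far above the target,
# so no two files would ever be grouped into the same output file.
assert data_len + delete_total > target
```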

build.gradle

Lines changed: 2 additions & 2 deletions
@@ -30,8 +30,8 @@ ext {
   spark_version = "3.1.1"
   ok_http3_version = "4.11.0"
   junit_version = "5.11.0"
-  iceberg_1_2_version = "1.2.0.6"
-  iceberg_1_5_version = "1.5.2.7"
+  iceberg_1_2_version = "1.2.0.11"
+  iceberg_1_5_version = "1.5.2.8"
   otel_agent_version = "2.12.0" // Bundles OTel SDK 1.47.0
   otel_annotations_version = "2.12.0" // Match agent version
 }

integrations/python/dataloader/CLAUDE.md

Lines changed: 7 additions & 4 deletions
@@ -25,24 +25,27 @@ When validating a change, always run both:
 1. `make verify` — lint, format checks, and unit tests
 2. Integration tests against Docker OpenHouse — start the Docker services, then run `make integration-tests`. These test the dataloader end-to-end against a real OpenHouse instance and must pass before a change is considered correct.
 
+Run `make format` before pushing to avoid CI formatting failures.
+
 ```bash
 # From the repo root, start Docker services (once per session):
-docker compose -f infra/recipes/docker-compose/oh-only/docker-compose.yml up -d
+docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml up -d
 
 # From the dataloader directory:
+make format
 make verify
 make integration-tests TOKEN_FILE=../../../tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/resources/dummy.token
 ```
 
 ## Integration Tests
 
-Integration tests run against an OpenHouse instance in Docker. To run them:
+Integration tests run inside a Docker container on the same network as the oh-hadoop-spark services. The `make integration-tests` target builds a test image and runs it automatically. Tables are created and populated via Spark SQL submitted through Livy.
 
 1. Start the Docker services from the repo root:
    ```bash
-   docker compose -f infra/recipes/docker-compose/oh-only/docker-compose.yml up -d
+   docker compose -f infra/recipes/docker-compose/oh-hadoop-spark/docker-compose.yml up -d
   ```
-2. Run the tests with the dummy token (uses `DummyTokenInterceptor`, no real auth needed):
+2. Wait for all services to be healthy (especially Livy and namenode), then run:
   ```bash
   make integration-tests TOKEN_FILE=../../../tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/resources/dummy.token
   ```
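The updated CLAUDE.md notes that tables are created and populated via Spark SQL submitted through Livy. A minimal sketch of that flow against the compose stack's Livy endpoint (port 9003, the same one the CI wait loop polls) could look like the following; the session kind, polling cadence, and example statement are assumptions, not the exact code the tests use:

```python
import time
import requests

LIVY = "http://localhost:9003"  # Livy port exposed by the oh-hadoop-spark compose stack

def run_spark_sql(code: str) -> dict:
    # Start a Spark SQL session through Livy's REST API.
    session = requests.post(f"{LIVY}/sessions", json={"kind": "sql"}).json()
    sid = session["id"]

    # Wait for the session to become idle before submitting statements.
    while requests.get(f"{LIVY}/sessions/{sid}").json()["state"] != "idle":
        time.sleep(2)

    # Submit the SQL statement and poll until it finishes.
    stmt = requests.post(f"{LIVY}/sessions/{sid}/statements", json={"code": code}).json()
    while True:
        result = requests.get(f"{LIVY}/sessions/{sid}/statements/{stmt['id']}").json()
        if result["state"] in ("available", "error"):
            return result
        time.sleep(2)

# Example: create a table the dataloader tests could then read back.
# run_spark_sql("CREATE TABLE openhouse.db.tbl (id INT, name STRING)")
```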

integrations/python/dataloader/Makefile

Lines changed: 17 additions & 3 deletions
@@ -1,4 +1,4 @@
-.PHONY: help sync clean lint format format-check typecheck check test verify build package-check integration-tests
+.PHONY: help sync clean lint format format-check typecheck check test verify build package-check build-itest integration-tests
 
 help:
 	@echo "Available commands:"
@@ -41,13 +41,27 @@ build:
 package-check:
 	uv run twine check dist/*
 
-integration-tests:
-	uv run python tests/integration_tests.py $(TOKEN_FILE)
+ITEST_IMAGE ?= oh-dataloader-itest
+DOCKER_NETWORK ?= oh-hadoop-spark_default
+ITEST_SITE = build/itest-site-packages
+
+build-itest: $(ITEST_SITE)
+
+$(ITEST_SITE): pyproject.toml uv.lock src/
+	rm -rf $(ITEST_SITE)
+	SETUPTOOLS_SCM_PRETEND_VERSION=0.0.0 uv pip install --target $(ITEST_SITE) --python-platform x86_64-unknown-linux-gnu --python-version 3.12 ".[dev]"
+
+integration-tests: $(ITEST_SITE)
+	docker build --platform linux/amd64 -t $(ITEST_IMAGE) -f tests/Dockerfile .
+	docker run --rm --platform linux/amd64 --network $(DOCKER_NETWORK) \
+		-e OH_TOKEN="$$(cat $(TOKEN_FILE))" \
+		$(ITEST_IMAGE) python3 tests/integration_tests.py
 
 clean:
 	rm -rf build/
 	rm -rf dist/
 	rm -rf *.egg-info
 	rm -rf .venv/
+	rm -rf $(ITEST_SITE)
 	find . -type d -name __pycache__ -exec rm -rf {} +
 	find . -type f -name "*.pyc" -delete

integrations/python/dataloader/src/openhouse/dataloader/data_loader_split.py

Lines changed: 9 additions & 0 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import hashlib
 from collections.abc import Iterator, Mapping
 from types import MappingProxyType
 
@@ -27,6 +28,14 @@ def __init__(
         self._udf_registry = udf_registry or NoOpRegistry()
         self._scan_context = scan_context
 
+    @property
+    def id(self) -> str:
+        """Unique ID for the split. This is stable across executions for a given
+        snapshot and split size.
+        """
+        file_path = self._file_scan_task.file.file_path
+        return hashlib.sha256(file_path.encode("utf-8")).hexdigest()
+
     @property
     def table_properties(self) -> Mapping[str, str]:
         """Properties of the table being loaded"""
integrations/python/dataloader/tests/Dockerfile

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+# Minimal runtime image with Java 8 + Hadoop 2.8 for HDFS access.
+# Dependencies are pre-installed into build/itest-site-packages on the host and copied in.
+FROM python:3.12-slim-bookworm
+
+# Copy JRE 8 and Hadoop 2.8 client from the hadoop base image.
+# JRE is needed by PyArrow's libhdfs JNI bridge to read HDFS data.
+# Hadoop 2.8 requires Java 8 — it is incompatible with Java 17 (NoClassDefFoundError).
+COPY --from=bde2020/hadoop-namenode:1.2.0-hadoop2.8-java8 /usr/lib/jvm/java-8-openjdk-amd64/jre /opt/java
+COPY --from=bde2020/hadoop-namenode:1.2.0-hadoop2.8-java8 /opt/hadoop-2.8.0 /opt/hadoop-2.8.0
+
+ENV JAVA_HOME=/opt/java
+ENV HADOOP_HOME=/opt/hadoop-2.8.0
+ENV ARROW_LIBHDFS_DIR="${HADOOP_HOME}/lib/native"
+
+# Expand CLASSPATH globs at build time. JNI does not expand '*' wildcards the
+# way the java launcher does, so we must list every jar explicitly.
+RUN echo "${HADOOP_HOME}/etc/hadoop" > /tmp/cp_parts && \
+    find ${HADOOP_HOME}/share/hadoop/common ${HADOOP_HOME}/share/hadoop/common/lib \
+         ${HADOOP_HOME}/share/hadoop/hdfs ${HADOOP_HOME}/share/hadoop/hdfs/lib \
+         -maxdepth 1 -name '*.jar' >> /tmp/cp_parts && \
+    paste -sd ':' /tmp/cp_parts > /tmp/cp && \
+    rm /tmp/cp_parts
+ENV CLASSPATH_FILE=/tmp/cp
+
+WORKDIR /app
+COPY build/itest-site-packages/ /app/site-packages/
+ENV PYTHONPATH=/app/site-packages
+COPY tests/ tests/
+
+# Set CLASSPATH from the expanded file at container start.
+ENTRYPOINT ["sh", "-c", "export CLASSPATH=$(cat /tmp/cp) && exec \"$@\"", "--"]

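For context on the new test image: PyArrow reaches HDFS through the libhdfs JNI bridge, which is why the Dockerfile bakes in JAVA_HOME, ARROW_LIBHDFS_DIR, and a fully expanded CLASSPATH. A minimal sketch of what a test process inside the container relies on; the namenode host name, port, and file path are assumptions based on the compose stack, not values taken from the tests:

```python
import os
import pyarrow.fs as pafs

# These are baked into the image; libhdfs refuses to start without them.
assert os.environ.get("JAVA_HOME") and os.environ.get("ARROW_LIBHDFS_DIR")
assert os.environ.get("CLASSPATH")  # expanded jar list, exported by the ENTRYPOINT

# Connect to the HDFS namenode on the compose network and read a data file
# (host, port, and path below are illustrative).
hdfs = pafs.HadoopFileSystem(host="namenode", port=9000)
with hdfs.open_input_file("/data/openhouse/db/tbl/data/part-00000.parquet") as f:
    print(f.size())
```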