Skip to content

Commit dfd33a9

Browse files
changes
1 parent 3f440be commit dfd33a9

7 files changed

Lines changed: 1580 additions & 0 deletions

File tree

java/NEXT_CHANGELOG.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@
66

77
### New Features and Improvements
88

9+
- **Arrow Flight Support (Experimental)**: Added support for ingesting Apache Arrow `VectorSchemaRoot` batches via Arrow Flight protocol
10+
  - **Note**: Arrow Flight ingestion is not yet enabled by default on the Zerobus server side.
11+
- New `ZerobusArrowStream` class with `ingestBatch()`, `waitForOffset()`, `flush()`, `close()`, `getUnackedBatches()` methods
12+
- New `ArrowStreamConfigurationOptions` for configuring Arrow streams (max inflight batches, recovery, timeouts)
13+
- New `createArrowStream()` and `recreateArrowStream()` methods on `ZerobusSdk`
14+
- Accepts `VectorSchemaRoot` directly via `ingestBatch()` (IPC serialization handled internally)
15+
- Arrow support is opt-in: the SDK declares `arrow-vector` (`>= 15.0.0`) and a memory allocator such as `arrow-memory-netty` with `provided` scope, so applications using Arrow Flight must add them to their own dependencies
16+
917
### Bug Fixes
1018

1119
- Fixed proto generation tool to skip reserved field numbers 19000-19999 for tables with more than 19000 columns
@@ -14,9 +22,20 @@
1422

1523
### Internal Changes
1624

25+
- Added `arrow-vector` 17.0.0 as provided dependency for Arrow Flight support
26+
- Added `arrow-memory-netty` 17.0.0 as test dependency for integration tests
27+
- Uses existing JNI Arrow Flight bindings from Rust SDK (`nativeCreateArrowStream`, `nativeIngestBatch`, etc.)
28+
1729
### Breaking Changes
1830

1931
### Deprecations
2032

2133
### API Changes
2234

35+
- Added `createArrowStream(String tableName, Schema schema, String clientId, String clientSecret)` to `ZerobusSdk`
36+
- Added `createArrowStream(String tableName, Schema schema, String clientId, String clientSecret, ArrowStreamConfigurationOptions options)` to `ZerobusSdk`
37+
- Added `recreateArrowStream(ZerobusArrowStream closedStream)` to `ZerobusSdk`
38+
- Added `ZerobusArrowStream` class with methods: `ingestBatch()`, `waitForOffset()`, `flush()`, `close()`, `getUnackedBatches()`, `isClosed()`, `getTableName()`, `getOptions()`
39+
- Added `ArrowStreamConfigurationOptions` class with fields: `maxInflightBatches`, `recovery`, `recoveryTimeoutMs`, `recoveryBackoffMs`, `recoveryRetries`, `serverLackOfAckTimeoutMs`, `flushTimeoutMs`, `connectionTimeoutMs`
40+
- Added optional dependency: `org.apache.arrow:arrow-vector >= 15.0.0` (provided scope)
41+
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
package com.databricks.zerobus.examples.arrow;

import com.databricks.zerobus.*;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.function.IntFunction;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.LargeVarCharVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
15+
16+
/**
17+
* Arrow Flight ingestion example.
18+
*
19+
* <p>Demonstrates ingesting columnar data using Apache Arrow record batches via the Arrow Flight
20+
* protocol. This provides high-performance ingestion for large datasets.
21+
*
22+
* <p>Prerequisites:
23+
*
24+
* <ul>
25+
* <li>A Delta table with columns: device_name (STRING), temp (INT), humidity (BIGINT)
26+
* <li>Apache Arrow Java libraries on the classpath (arrow-vector, arrow-memory-netty)
27+
* </ul>
28+
*
29+
* <p>Run with: {@code java -cp <classpath> com.databricks.zerobus.examples.arrow.ArrowIngestionExample}
30+
*/
31+
public class ArrowIngestionExample {
32+
33+
public static void main(String[] args) throws Exception {
34+
String serverEndpoint = System.getenv("ZEROBUS_SERVER_ENDPOINT");
35+
String workspaceUrl = System.getenv("DATABRICKS_WORKSPACE_URL");
36+
String tableName = System.getenv("ZEROBUS_TABLE_NAME");
37+
String clientId = System.getenv("DATABRICKS_CLIENT_ID");
38+
String clientSecret = System.getenv("DATABRICKS_CLIENT_SECRET");
39+
40+
if (serverEndpoint == null
41+
|| workspaceUrl == null
42+
|| tableName == null
43+
|| clientId == null
44+
|| clientSecret == null) {
45+
System.err.println("Error: Required environment variables not set.");
46+
System.err.println(
47+
"Set: ZEROBUS_SERVER_ENDPOINT, DATABRICKS_WORKSPACE_URL, ZEROBUS_TABLE_NAME,");
48+
System.err.println(" DATABRICKS_CLIENT_ID, DATABRICKS_CLIENT_SECRET");
49+
System.exit(1);
50+
}
51+
52+
System.out.println("=== Arrow Flight Ingestion Example ===\n");
53+
54+
// Define the Arrow schema matching the Delta table
55+
Schema schema =
56+
new Schema(
57+
Arrays.asList(
58+
Field.nullable("device_name", ArrowType.LargeUtf8.INSTANCE),
59+
Field.nullable("temp", new ArrowType.Int(32, true)),
60+
Field.nullable("humidity", new ArrowType.Int(64, true))));
61+
62+
try (BufferAllocator allocator = new RootAllocator();
63+
ZerobusSdk sdk = new ZerobusSdk(serverEndpoint, workspaceUrl)) {
64+
65+
// === Single batch ingestion ===
66+
System.out.println("--- Single Batch Ingestion ---");
67+
68+
ZerobusArrowStream stream =
69+
sdk.createArrowStream(tableName, schema, clientId, clientSecret).join();
70+
71+
try {
72+
try (VectorSchemaRoot batch = VectorSchemaRoot.create(schema, allocator)) {
73+
LargeVarCharVector nameVector = (LargeVarCharVector) batch.getVector("device_name");
74+
IntVector tempVector = (IntVector) batch.getVector("temp");
75+
BigIntVector humidityVector = (BigIntVector) batch.getVector("humidity");
76+
77+
int rowCount = 5;
78+
batch.allocateNew();
79+
for (int i = 0; i < rowCount; i++) {
80+
nameVector.setSafe(i, ("arrow-device-" + i).getBytes());
81+
tempVector.setSafe(i, 20 + i);
82+
humidityVector.setSafe(i, 50 + i);
83+
}
84+
batch.setRowCount(rowCount);
85+
86+
long offset = stream.ingestBatch(batch).get();
87+
stream.waitForOffset(offset);
88+
System.out.println(
89+
" " + rowCount + " rows ingested and acknowledged (offset: " + offset + ")");
90+
}
91+
92+
// === Multiple batch ingestion ===
93+
System.out.println("\n--- Multiple Batch Ingestion ---");
94+
95+
long lastOffset = -1;
96+
for (int batchNum = 0; batchNum < 3; batchNum++) {
97+
try (VectorSchemaRoot batch = VectorSchemaRoot.create(schema, allocator)) {
98+
LargeVarCharVector nameVector = (LargeVarCharVector) batch.getVector("device_name");
99+
IntVector tempVector = (IntVector) batch.getVector("temp");
100+
BigIntVector humidityVector = (BigIntVector) batch.getVector("humidity");
101+
102+
int rowCount = 10;
103+
batch.allocateNew();
104+
for (int i = 0; i < rowCount; i++) {
105+
nameVector.setSafe(i, ("arrow-batch-" + batchNum + "-row-" + i).getBytes());
106+
tempVector.setSafe(i, 30 + i);
107+
humidityVector.setSafe(i, 60 + i);
108+
}
109+
batch.setRowCount(rowCount);
110+
111+
lastOffset = stream.ingestBatch(batch).get();
112+
}
113+
}
114+
stream.flush();
115+
System.out.println(" 3 batches (30 rows total) ingested and flushed");
116+
117+
// === Custom options ===
118+
System.out.println("\n--- Custom Options ---");
119+
120+
ArrowStreamConfigurationOptions customOptions =
121+
ArrowStreamConfigurationOptions.builder()
122+
.setMaxInflightBatches(2000)
123+
.setFlushTimeoutMs(600000)
124+
.setRecovery(true)
125+
.setRecoveryRetries(5)
126+
.build();
127+
System.out.println(
128+
" maxInflightBatches: " + customOptions.maxInflightBatches());
129+
System.out.println(" flushTimeoutMs: " + customOptions.flushTimeoutMs());
130+
System.out.println(" recoveryRetries: " + customOptions.recoveryRetries());
131+
132+
} finally {
133+
stream.close();
134+
}
135+
136+
// === Demonstrate getUnackedBatches and recreateArrowStream ===
137+
System.out.println("\n--- Unacked Batches (after close) ---");
138+
139+
List<byte[]> unackedBatches = stream.getUnackedBatches();
140+
System.out.println(" Unacked batches: " + unackedBatches.size());
141+
System.out.println(" (Expected 0 after successful flush/close)");
142+
143+
System.out.println("\n--- Recreate Arrow Stream ---");
144+
145+
ZerobusArrowStream newStream = sdk.recreateArrowStream(stream).join();
146+
try {
147+
try (VectorSchemaRoot batch = VectorSchemaRoot.create(schema, allocator)) {
148+
LargeVarCharVector nameVector = (LargeVarCharVector) batch.getVector("device_name");
149+
IntVector tempVector = (IntVector) batch.getVector("temp");
150+
BigIntVector humidityVector = (BigIntVector) batch.getVector("humidity");
151+
152+
batch.allocateNew();
153+
nameVector.setSafe(0, "arrow-recreated".getBytes());
154+
tempVector.setSafe(0, 99);
155+
humidityVector.setSafe(0, 99);
156+
batch.setRowCount(1);
157+
158+
long offset = newStream.ingestBatch(batch).get();
159+
newStream.waitForOffset(offset);
160+
System.out.println(" 1 row ingested on recreated stream (offset: " + offset + ")");
161+
}
162+
} finally {
163+
newStream.close();
164+
}
165+
166+
System.out.println("\n=== Arrow Flight Example Complete ===");
167+
}
168+
}
169+
}

java/pom.xml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
<url>https://github.com/databricks/zerobus-sdk-java/tree/main</url>
2929
</scm>
3030
<properties>
31+
<arrow.version>17.0.0</arrow.version>
3132
<maven.compiler.source>1.8</maven.compiler.source>
3233
<maven.compiler.target>1.8</maven.compiler.target>
3334
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -45,6 +46,21 @@
4546
<artifactId>slf4j-api</artifactId>
4647
<version>2.0.17</version>
4748
</dependency>
49+
<!-- Apache Arrow (provided - users must supply arrow-vector >= 15.0.0 and a memory
50+
allocator such as arrow-memory-netty). Only needed for Arrow Flight ingestion. -->
51+
<dependency>
52+
<groupId>org.apache.arrow</groupId>
53+
<artifactId>arrow-vector</artifactId>
54+
<version>${arrow.version}</version>
55+
<scope>provided</scope>
56+
</dependency>
57+
<!-- Apache Arrow memory allocator (for tests only) -->
58+
<dependency>
59+
<groupId>org.apache.arrow</groupId>
60+
<artifactId>arrow-memory-netty</artifactId>
61+
<version>${arrow.version}</version>
62+
<scope>test</scope>
63+
</dependency>
4864
<!-- SLF4J Simple Implementation (for tests only) -->
4965
<dependency>
5066
<groupId>org.slf4j</groupId>

0 commit comments

Comments
 (0)