The PyArrow library offers a powerful API for Python that can be integrated with Java applications. This document provides a guide on how to enable seamless data exchange between Python and Java components using PyArrow.
This section demonstrates a data roundtrip where C Data interface is being used to provide the seamless access to data across language boundaries.
In the Python-based component, the data roundtrip process is demonstrated through a sequential workflow.
- Create data in Python
- Export data to Java
- Import updated data from Java
- Validate the data consistency
The Python code uses jpype to start the JVM and make the Java class MapValuesConsumer available to Python. Data is generated in PyArrow and exported through C Data to Java.
import jpype
import jpype.imports
from jpype.types import JClass
import pyarrow as pa
from pyarrow.cffi import ffi as arrow_c
# Init the JVM and make MapValuesConsumer class available to Python.
jpype.startJVM(classpath=[ "../target/*"])
java_c_package = jpype.JPackage("org").apache.arrow.c
MapValuesConsumer = JClass('MapValuesConsumer')
CDataDictionaryProvider = JClass('org.apache.arrow.c.CDataDictionaryProvider')
# Starting from Python and generating data
# Create a Python DictionaryArray
dictionary = pa.dictionary(pa.int64(), pa.utf8())
array = pa.array(["A", "B", "C", "A", "D"], dictionary)
print("From Python")
print("Dictionary Created:", array)
# create the CDataDictionaryProvider instance which is
# required to create dictionary array precisely
c_provider = CDataDictionaryProvider()
consumer = MapValuesConsumer(c_provider)
# Export the Python array through C Data
c_array = arrow_c.new("struct ArrowArray*")
c_array_ptr = int(arrow_c.cast("uintptr_t", c_array))
array._export_to_c(c_array_ptr)
# Export the Schema of the Array through C Data
c_schema = arrow_c.new("struct ArrowSchema*")
c_schema_ptr = int(arrow_c.cast("uintptr_t", c_schema))
array.type._export_to_c(c_schema_ptr)
# Send Array and its Schema to the Java function
consumer.callToJava(c_array_ptr, c_schema_ptr)
# Importing updated values from Java to Python
# Export the Python array through C Data
c_array_from_java = arrow_c.new("struct ArrowArray*")
c_array_ptr_from_java = int(arrow_c.cast("uintptr_t", c_array_from_java))
# Export the Schema of the Array through C Data
c_schema_from_java = arrow_c.new("struct ArrowSchema*")
c_schema_ptr_from_java = int(arrow_c.cast("uintptr_t", c_schema_from_java))
java_wrapped_array = java_c_package.ArrowArray.wrap(c_array_ptr_from_java)
java_wrapped_schema = java_c_package.ArrowSchema.wrap(c_schema_ptr_from_java)
java_c_package.Data.exportVector(
consumer.getAllocatorForJavaConsumer(),
consumer.getVector(),
c_provider,
java_wrapped_array,
java_wrapped_schema
)
print("From Java back to Python")
array_from_java = pa.Array._import_from_c(c_array_ptr_from_java, c_schema_ptr_from_java)
# In Java and Python, the same memory is being accessed through the C Data interface.
# Since the array from Java and array created in Python should have same data.
assert array_from_java.equals(array)
print("Array from Java: ", array_from_java)
# Releasing Java C Data source.
del array_from_java
consumer.close()
jpype.shutdownJVM()From Python
Dictionary Created:
-- dictionary:
[
"A",
"B",
"C",
"D"
]
-- indices:
[
0,
1,
2,
0,
3
]
Doing work in Java
From Java back to Python
Array from Java:
-- dictionary:
[
"A",
"B",
"C",
"D"
]
-- indices:
[
2,
1,
2,
0,
3
]In the Java-based component of the system, the following operations are executed:
- Receives data from the Python component.
- Updates the data.
- Exports the updated data back to Python.
MapValuesConsumer class uses C Data interface to access the data created in Python.
.. testcode::
import org.apache.arrow.c.ArrowArray;
import org.apache.arrow.c.ArrowSchema;
import org.apache.arrow.c.Data;
import org.apache.arrow.c.CDataDictionaryProvider;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.util.AutoCloseables;
class MapValuesConsumer implements AutoCloseable {
private final BufferAllocator allocator;
private final CDataDictionaryProvider provider;
private FieldVector vector;
private final BigIntVector intVector;
public MapValuesConsumer(CDataDictionaryProvider provider, BufferAllocator allocator) {
this.provider = provider;
this.allocator = allocator;
this.intVector = new BigIntVector("internal_test_vector", allocator);
}
public BufferAllocator getAllocatorForJavaConsumer() {
return allocator;
}
public FieldVector getVector() {
return this.vector;
}
public void update(long c_array_ptr, long c_schema_ptr) {
ArrowArray arrow_array = ArrowArray.wrap(c_array_ptr);
ArrowSchema arrow_schema = ArrowSchema.wrap(c_schema_ptr);
this.vector = Data.importVector(allocator, arrow_array, arrow_schema, this.provider);
this.doWorkInJava(vector);
}
public FieldVector updateFromJava(long c_array_ptr, long c_schema_ptr) {
ArrowArray arrow_array = ArrowArray.wrap(c_array_ptr);
ArrowSchema arrow_schema = ArrowSchema.wrap(c_schema_ptr);
this.vector = Data.importVector(allocator, arrow_array, arrow_schema, this.provider);
this.doWorkInJava(vector);
return vector;
}
private void doWorkInJava(FieldVector vector) {
System.out.println("Doing work in Java");
BigIntVector bigIntVector = (BigIntVector)vector;
bigIntVector.setSafe(0, 2);
}
public BigIntVector getIntVectorForJavaConsumer() {
intVector.allocateNew(3);
intVector.set(0, 1);
intVector.set(1, 7);
intVector.set(2, 93);
intVector.setValueCount(3);
return intVector;
}
@Override
public void close() throws Exception {
AutoCloseables.close(intVector);
}
}
try (BufferAllocator allocator = new RootAllocator()) {
CDataDictionaryProvider provider = new CDataDictionaryProvider();
try (final MapValuesConsumer mvc = new MapValuesConsumer(provider, allocator)) {
try (
ArrowArray arrowArray = ArrowArray.allocateNew(allocator);
ArrowSchema arrowSchema = ArrowSchema.allocateNew(allocator)
) {
Data.exportVector(allocator, mvc.getIntVectorForJavaConsumer(), provider, arrowArray,
arrowSchema);
FieldVector updatedVector = mvc.updateFromJava(arrowArray.memoryAddress(),
arrowSchema.memoryAddress());
try (ArrowArray usedArray = ArrowArray.allocateNew(allocator);
ArrowSchema usedSchema = ArrowSchema.allocateNew(allocator)) {
Data.exportVector(allocator, updatedVector, provider, usedArray, usedSchema);
try (FieldVector valueVectors = Data.importVector(allocator, usedArray, usedSchema,
provider)) {
System.out.println(valueVectors);
}
}
updatedVector.close();
} catch (Exception ex) {
ex.printStackTrace();
}
} catch (Exception ex) {
ex.printStackTrace();
}
} catch (Exception ex) {
ex.printStackTrace();
}
.. testoutput:: Doing work in Java [2, 7, 93]
By integrating PyArrow in Python and Java components, this example demonstrates that a system can be created where data is shared and updated across both languages seamlessly.