|
15 | 15 | // specific language governing permissions and limitations |
16 | 16 | // under the License. |
17 | 17 |
|
| 18 | +#include <arrow/compute/function.h> |
| 19 | +#include <arrow/compute/registry.h> |
18 | 20 | #include <cstdint> |
19 | 21 | #include <memory> |
20 | 22 | #include <string> |
| 23 | +#include <thread> |
21 | 24 | #include <tuple> |
22 | 25 | #include <vector> |
23 | 26 |
|
24 | 27 | #include <gmock/gmock.h> |
25 | 28 | #include <gtest/gtest.h> |
26 | 29 |
|
| 30 | +#include <arrow/dataset/dataset.h> |
| 31 | +#include <arrow/dataset/file_base.h> |
| 32 | +#include <arrow/record_batch.h> |
| 33 | +#include <arrow/util/async_generator.h> |
27 | 34 | #include "arrow/acero/exec_plan.h" |
28 | 35 | #include "arrow/acero/test_util_internal.h" |
29 | 36 | #include "arrow/array/array_primitive.h" |
30 | 37 | #include "arrow/compute/test_util_internal.h" |
31 | 38 | #include "arrow/dataset/api.h" |
32 | 39 | #include "arrow/dataset/partition.h" |
33 | 40 | #include "arrow/dataset/plan.h" |
| 41 | +#include "arrow/dataset/projector.h" |
34 | 42 | #include "arrow/dataset/test_util_internal.h" |
35 | 43 | #include "arrow/filesystem/path_util.h" |
36 | 44 | #include "arrow/filesystem/test_util.h" |
@@ -353,6 +361,165 @@ TEST_F(TestFileSystemDataset, WriteProjected) { |
353 | 361 | } |
354 | 362 | } |
355 | 363 |
|
| 364 | +// This kernel delays execution for some specific scalar values, |
| 365 | +// which guarantees the writing phase sees out-of-order exec batches |
| 366 | +Status delay(compute::KernelContext* ctx, const compute::ExecSpan& batch, |
| 367 | + compute::ExecResult* out) { |
| 368 | + const ArraySpan& input = batch[0].array; |
| 369 | + const auto* input_values = input.GetValues<uint32_t>(1); |
| 370 | + uint8_t* output_values = out->array_span()->buffers[1].data; |
| 371 | + |
| 372 | + // Boolean data is stored in 1 bit per value |
| 373 | + for (int64_t i = 0; i < input.length; ++i) { |
| 374 | + if (input_values[i] % 16 == 0) { |
| 375 | + std::this_thread::sleep_for(std::chrono::milliseconds(10)); |
| 376 | + } |
| 377 | + bit_util::SetBitTo(output_values, i, true); |
| 378 | + } |
| 379 | + |
| 380 | + return Status::OK(); |
| 381 | +} |
| 382 | + |
| 383 | +// A fragment with start=0 will defer ScanBatchesAsync returning a batch generator |
| 384 | +// This guarantees a dataset of multiple fragments could produce out-of-order batches |
| 385 | +class MockFragment : public Fragment { |
| 386 | + public: |
| 387 | + explicit MockFragment(uint32_t start, int64_t rows_per_batch, int num_batches, |
| 388 | + const std::shared_ptr<Schema>& schema) |
| 389 | + : Fragment(compute::literal(true), schema), |
| 390 | + start_(start), |
| 391 | + rows_per_batch_(rows_per_batch), |
| 392 | + num_batches_(num_batches) {} |
| 393 | + |
| 394 | + Result<RecordBatchGenerator> ScanBatchesAsync( |
| 395 | + const std::shared_ptr<ScanOptions>& options) override { |
| 396 | + // Fragment with start_=0 defers returning the generator |
| 397 | + if (start_ == 0) { |
| 398 | + std::this_thread::sleep_for(std::chrono::duration<double>(0.1)); |
| 399 | + } |
| 400 | + |
| 401 | + auto vec = gen::Gen({gen::Step(start_)}) |
| 402 | + ->FailOnError() |
| 403 | + ->RecordBatches(rows_per_batch_, num_batches_); |
| 404 | + auto it = MakeVectorIterator(vec); |
| 405 | + return MakeBackgroundGenerator(std::move(it), io::default_io_context().executor()); |
| 406 | + } |
| 407 | + |
| 408 | + std::string type_name() const override { return "mock"; } |
| 409 | + |
| 410 | + protected: |
| 411 | + Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override { |
| 412 | + return given_physical_schema_; |
| 413 | + }; |
| 414 | + |
| 415 | + private: |
| 416 | + uint32_t start_; |
| 417 | + int64_t rows_per_batch_; |
| 418 | + int num_batches_; |
| 419 | +}; |
| 420 | + |
| 421 | +// This dataset consists of multiple fragments with incrementing values across the |
| 422 | +// fragments |
| 423 | +class MockDataset : public Dataset { |
| 424 | + public: |
| 425 | + explicit MockDataset(const std::shared_ptr<Schema>& schema) : Dataset(schema) {} |
| 426 | + |
| 427 | + MockDataset(const std::shared_ptr<Schema>& schema, |
| 428 | + const compute::Expression& partition_expression) |
| 429 | + : Dataset(schema, partition_expression) {} |
| 430 | + |
| 431 | + std::string type_name() const override { return "mock"; } |
| 432 | + Result<std::shared_ptr<Dataset>> ReplaceSchema( |
| 433 | + std::shared_ptr<Schema> schema) const override { |
| 434 | + RETURN_NOT_OK(CheckProjectable(*schema_, *schema)); |
| 435 | + return std::make_shared<MockDataset>(std::move(schema)); |
| 436 | + } |
| 437 | + |
| 438 | + protected: |
| 439 | + Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override { |
| 440 | + FragmentVector fragments; |
| 441 | + fragments.push_back(std::make_shared<MockFragment>(0, 2, 1024, schema_)); |
| 442 | + fragments.push_back(std::make_shared<MockFragment>(2 * 1024, 2, 1024, schema_)); |
| 443 | + return MakeVectorIterator(std::move(fragments)); |
| 444 | + }; |
| 445 | +}; |
| 446 | + |
| 447 | +TEST_F(TestFileSystemDataset, MultiThreadedWritePersistsOrder) { |
| 448 | + // Test for GH-26818 |
| 449 | + // |
| 450 | + // This test uses std::this_thread::sleep_for to increase chances for batches |
| 451 | + // to get written out-of-order in multi-threaded environment. |
| 452 | + // With preserve_order = false, the existence of out-of-order is asserted to |
| 453 | + // verify that the test setup reliably writes out-of-order sequences, and |
| 454 | + // that write_options.preserve_order = preserve_order can recreate order. |
| 455 | + // |
| 456 | + // Estimates for out_of_order == false and preserve_order == false to occur |
| 457 | + // are 10^-62 https://github.com/apache/arrow/pull/44470#discussion_r2079049038 |
| 458 | + // |
| 459 | + // If this test starts to reliably fail with preserve_order == false, the test setup |
| 460 | + // has to be revised to again reliably produce out-of-order sequences. |
| 461 | + auto format = std::make_shared<IpcFileFormat>(); |
| 462 | + FileSystemDatasetWriteOptions write_options; |
| 463 | + write_options.file_write_options = format->DefaultWriteOptions(); |
| 464 | + write_options.base_dir = "root"; |
| 465 | + write_options.partitioning = std::make_shared<HivePartitioning>(schema({})); |
| 466 | + write_options.basename_template = "{i}.feather"; |
| 467 | + |
| 468 | + // The Mock dataset delays emitting the first fragment, which test sequenced output of |
| 469 | + // scan node |
| 470 | + auto dataset = std::make_shared<MockDataset>(schema({field("f0", int32())})); |
| 471 | + |
| 472 | + // The delay scalar function delays some batches of all fragments, which tests implicit |
| 473 | + // ordering |
| 474 | + auto delay_func = std::make_shared<compute::ScalarFunction>("delay", compute::Arity(1), |
| 475 | + compute::FunctionDoc()); |
| 476 | + compute::ScalarKernel delay_kernel; |
| 477 | + delay_kernel.exec = delay; |
| 478 | + delay_kernel.signature = compute::KernelSignature::Make({int32()}, boolean()); |
| 479 | + ASSERT_OK(delay_func->AddKernel(delay_kernel)); |
| 480 | + ASSERT_OK(compute::GetFunctionRegistry()->AddFunction(delay_func)); |
| 481 | + |
| 482 | + for (bool preserve_order : {true, false}) { |
| 483 | + ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan()); |
| 484 | + ASSERT_OK(scanner_builder->UseThreads(true)); |
| 485 | + ASSERT_OK( |
| 486 | + scanner_builder->Filter(compute::call("delay", {compute::field_ref("f0")}))); |
| 487 | + ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish()); |
| 488 | + |
| 489 | + auto fs = std::make_shared<fs::internal::MockFileSystem>(fs::kNoTime); |
| 490 | + write_options.filesystem = fs; |
| 491 | + write_options.preserve_order = preserve_order; |
| 492 | + |
| 493 | + ASSERT_OK(FileSystemDataset::Write(write_options, scanner)); |
| 494 | + |
| 495 | + // Read the file back out and verify the order |
| 496 | + ASSERT_OK_AND_ASSIGN(auto dataset_factory, FileSystemDatasetFactory::Make( |
| 497 | + fs, {"root/0.feather"}, format, {})); |
| 498 | + ASSERT_OK_AND_ASSIGN(auto written_dataset, dataset_factory->Finish(FinishOptions{})); |
| 499 | + ASSERT_OK_AND_ASSIGN(scanner_builder, written_dataset->NewScan()); |
| 500 | + ASSERT_OK(scanner_builder->UseThreads(false)); |
| 501 | + ASSERT_OK_AND_ASSIGN(scanner, scanner_builder->Finish()); |
| 502 | + ASSERT_OK_AND_ASSIGN(auto actual, scanner->ToTable()); |
| 503 | + TableBatchReader reader(*actual); |
| 504 | + std::shared_ptr<RecordBatch> batch; |
| 505 | + ASSERT_OK(reader.ReadNext(&batch)); |
| 506 | + int32_t prev = -1; |
| 507 | + auto out_of_order = false; |
| 508 | + while (batch != nullptr) { |
| 509 | + const auto* values = batch->column(0)->data()->GetValues<int32_t>(1); |
| 510 | + for (int row = 0; row < batch->num_rows(); ++row) { |
| 511 | + int32_t value = values[row]; |
| 512 | + if (value <= prev) { |
| 513 | + out_of_order = true; |
| 514 | + } |
| 515 | + prev = value; |
| 516 | + } |
| 517 | + ASSERT_OK(reader.ReadNext(&batch)); |
| 518 | + } |
| 519 | + ASSERT_EQ(!out_of_order, preserve_order); |
| 520 | + } |
| 521 | +} |
| 522 | + |
356 | 523 | class FileSystemWriteTest : public testing::TestWithParam<std::tuple<bool, bool>> { |
357 | 524 | using PlanFactory = std::function<std::vector<acero::Declaration>( |
358 | 525 | const FileSystemDatasetWriteOptions&, |
|
0 commit comments