diff --git a/ydb/library/workload/vector/configure_opts.cpp b/ydb/library/workload/vector/configure_opts.cpp index d1126c9f9afe..2120005b73f0 100644 --- a/ydb/library/workload/vector/configure_opts.cpp +++ b/ydb/library/workload/vector/configure_opts.cpp @@ -4,6 +4,13 @@ namespace NYdbWorkload::NVector { +void ConfigureTableOpts(NLastGetopt::TOpts& opts, TTableOpts* tableOpts) { + opts.AddLongOption( "table", "Table name") + .RequiredArgument("NAME") + .DefaultValue(tableOpts->Name) + .StoreResult(&tableOpts->Name); +} + void ConfigureVectorOpts(NLastGetopt::TOpts& opts, TVectorOpts* vectorOpts) { NColorizer::TColors colors = NYdb::NConsoleClient::AutoColors(Cout); diff --git a/ydb/library/workload/vector/configure_opts.h b/ydb/library/workload/vector/configure_opts.h index dab87efd988c..c4c849d6eaa5 100644 --- a/ydb/library/workload/vector/configure_opts.h +++ b/ydb/library/workload/vector/configure_opts.h @@ -6,11 +6,25 @@ namespace NYdbWorkload::NVector { +struct TTableOpts { + TString Name = "vector_index_workload"; +}; + +struct TTablePartitioningOpts { + size_t MinPartitions = 40; + size_t PartitionSize = 2000; + bool AutoPartitioningByLoad = true; +}; + struct TVectorOpts { TString VectorType = "float"; size_t VectorDimension = 1024; }; +void ConfigureTableOpts(NLastGetopt::TOpts& opts, TTableOpts* tableOpts); + +void ConfigureTablePartitioningOpts(NLastGetopt::TOpts& opts, TTablePartitioningOpts* partitioningOpts); + void ConfigureVectorOpts(NLastGetopt::TOpts& opts, TVectorOpts* vectorOpts); } diff --git a/ydb/library/workload/vector/vector_command_index.cpp b/ydb/library/workload/vector/vector_command_index.cpp index c63e81ce7127..c54471185cfd 100644 --- a/ydb/library/workload/vector/vector_command_index.cpp +++ b/ydb/library/workload/vector/vector_command_index.cpp @@ -62,7 +62,7 @@ int TWorkloadCommandBuildIndex::DoRun() { ); )_", Params.DbPath.c_str(), - Params.TableName.c_str(), + Params.TableOpts.Name.c_str(), Params.IndexName.c_str(), Params.Distance.c_str(), Params.VectorOpts.VectorType.c_str(), @@ -94,7 +94,7 @@ int TWorkloadCommandDropIndex::DoRun() { DROP INDEX `{2}`; )_", Params.DbPath.c_str(), - Params.TableName.c_str(), + Params.TableOpts.Name.c_str(), Params.IndexName.c_str() ); diff --git a/ydb/library/workload/vector/vector_data_generator.cpp b/ydb/library/workload/vector/vector_data_generator.cpp index bbdf66bc8f36..9b2e6c56d915 100644 --- a/ydb/library/workload/vector/vector_data_generator.cpp +++ b/ydb/library/workload/vector/vector_data_generator.cpp @@ -289,7 +289,7 @@ class TRandomDataGenerator final: public IBulkDataGenerator { public: TRandomDataGenerator(const TVectorWorkloadParams& params, const NVector::TVectorOpts& vectorOpts, const size_t rowCount, const uint32_t randomSeed) - : IBulkDataGenerator(params.TableName, rowCount) + : IBulkDataGenerator(params.TableOpts.Name, rowCount) , Params(params) , VectorOpts(vectorOpts) , RowCount(rowCount) @@ -360,7 +360,7 @@ class TRandomDataGenerator final: public IBulkDataGenerator { arrow::ipc::SerializeSchema(*schema).ValueOrDie()->ToString() ); - return {MakeIntrusive(Params.GetFullTableName(Params.TableName.c_str()), std::move(arrowData), currentBatchSize)}; + return {MakeIntrusive(Params.GetFullTableName(Params.TableOpts.Name.c_str()), std::move(arrowData), currentBatchSize)}; } } }; @@ -402,9 +402,9 @@ void TWorkloadVectorFilesDataInitializer::ConfigureOpts(NLastGetopt::TOpts& opts TBulkDataGeneratorList TWorkloadVectorFilesDataInitializer::DoGetBulkInitialData() { const auto basicDataGenerator = std::make_shared( *this, - Params.TableName, + Params.TableOpts.Name, 0, - Params.TableName, + Params.TableOpts.Name, DataFiles, Params.GetColumns(), TDataGenerator::EPortionSizeUnit::Line diff --git a/ydb/library/workload/vector/vector_recall_evaluator.cpp b/ydb/library/workload/vector/vector_recall_evaluator.cpp index d5b8d5c47a81..7733700813e7 100644 --- a/ydb/library/workload/vector/vector_recall_evaluator.cpp +++ b/ydb/library/workload/vector/vector_recall_evaluator.cpp @@ -39,7 +39,7 @@ void TVectorRecallEvaluator::SelectReferenceResults(const TVectorSampler& sample << "SELECT s.id AS id" << ", " << (isAscending ? "BOTTOM_BY" : "TOP_BY") << "(" << MakeKeyExpression(Params, "m.") << ", Knn::" << functionName << "(m." << Params.EmbeddingColumn << ", s.embedding), " << Params.Limit << ") result_ids" - << " FROM " << Params.TableName << " m" + << " FROM " << Params.TableOpts.Name << " m" << (Params.PrefixColumn ? " INNER JOIN " : " CROSS JOIN ") << "AS_TABLE($Samples) AS s"; if (Params.PrefixColumn) { refQueryBuilder << " ON s.prefix = m." << *Params.PrefixColumn; diff --git a/ydb/library/workload/vector/vector_sampler.cpp b/ydb/library/workload/vector/vector_sampler.cpp index 6c18a30461bb..74e1cbc5a08f 100644 --- a/ydb/library/workload/vector/vector_sampler.cpp +++ b/ydb/library/workload/vector/vector_sampler.cpp @@ -20,7 +20,7 @@ TVectorSampler::TVectorSampler(const TVectorWorkloadParams& params) ui64 TVectorSampler::SelectOneId(bool min) { std::string query = std::format(R"_(--!syntax_v1 SELECT Unwrap(CAST({1} AS uint64)) AS id FROM {0} ORDER BY {1} {2} LIMIT 1; - )_", Params.TableName.c_str(), Params.KeyColumns[0].c_str(), min ? "" : "DESC"); + )_", Params.TableOpts.Name.c_str(), Params.KeyColumns[0].c_str(), min ? "" : "DESC"); // Execute the query std::optional rangeResultSet; @@ -148,13 +148,13 @@ void TVectorSampler::SampleExistingVectors() { SELECT Unwrap(CAST({0} as uint64)) as id, Unwrap({1}) as embedding, Unwrap({2}) as prefix_value FROM {3} WHERE {0} = {4}; - )_", Params.KeyColumns[0].c_str(), Params.EmbeddingColumn.c_str(), Params.PrefixColumn->c_str(), Params.TableName.c_str(), randomId); + )_", Params.KeyColumns[0].c_str(), Params.EmbeddingColumn.c_str(), Params.PrefixColumn->c_str(), Params.TableOpts.Name.c_str(), randomId); } else { vectorQuery = std::format(R"_(--!syntax_v1 SELECT Unwrap(CAST({0} as uint64)) as id, Unwrap({1}) as embedding FROM {2} WHERE {0} = {3}; - )_", Params.KeyColumns[0].c_str(), Params.EmbeddingColumn.c_str(), Params.TableName.c_str(), randomId); + )_", Params.KeyColumns[0].c_str(), Params.EmbeddingColumn.c_str(), Params.TableOpts.Name.c_str(), randomId); } std::optional vectorResultSet; diff --git a/ydb/library/workload/vector/vector_sql.cpp b/ydb/library/workload/vector/vector_sql.cpp index ee250edd4a00..e406a2520f6b 100644 --- a/ydb/library/workload/vector/vector_sql.cpp +++ b/ydb/library/workload/vector/vector_sql.cpp @@ -64,7 +64,7 @@ std::string MakeSelect(const TVectorWorkloadParams& params, const TString& index if (params.PrefixColumn) ret << "DECLARE $PrefixValue as " << params.PrefixType << ";" << "\n"; ret << "pragma ydb.KMeansTreeSearchTopSize=\"" << params.KmeansTreeSearchClusters << "\";" << "\n"; - ret << "SELECT " << MakeKeyExpression(params, "") << " AS id FROM " << params.TableName << "\n"; + ret << "SELECT " << MakeKeyExpression(params, "") << " AS id FROM " << params.TableOpts.Name << "\n"; if (!indexName.empty()) ret << "VIEW " << indexName << "\n"; if (params.PrefixColumn) diff --git a/ydb/library/workload/vector/vector_workload_generator.cpp b/ydb/library/workload/vector/vector_workload_generator.cpp index 342544fb7dc7..b7cbac0dca02 100644 --- a/ydb/library/workload/vector/vector_workload_generator.cpp +++ b/ydb/library/workload/vector/vector_workload_generator.cpp @@ -35,16 +35,23 @@ void TVectorWorkloadGenerator::Init() { std::string TVectorWorkloadGenerator::GetDDLQueries() const { return std::format(R"_(--!syntax_v1 - CREATE TABLE `{0}/{1}`( - id Uint64, - embedding String, - PRIMARY KEY(id)) - WITH ( - AUTO_PARTITIONING_BY_SIZE = ENABLED, - AUTO_PARTITIONING_PARTITION_SIZE_MB = 500, - AUTO_PARTITIONING_BY_LOAD = ENABLED - ) - )_", Params.DbPath.c_str(), Params.TableName.c_str()); + CREATE TABLE `{0}/{1}`( + id Uint64, + embedding String, + PRIMARY KEY(id)) + WITH ( + AUTO_PARTITIONING_BY_SIZE = ENABLED, + AUTO_PARTITIONING_BY_LOAD = {2}, + AUTO_PARTITIONING_PARTITION_SIZE_MB = {3}, + AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = {4} + ) + )_", + Params.DbPath.c_str(), + Params.TableOpts.Name.c_str(), + Params.TablePartitioningOpts.AutoPartitioningByLoad ? "ENABLED" : "DISABLED", + Params.TablePartitioningOpts.PartitionSize, + Params.TablePartitioningOpts.MinPartitions + ); } TQueryInfoList TVectorWorkloadGenerator::GetInitialData() { @@ -52,7 +59,7 @@ TQueryInfoList TVectorWorkloadGenerator::GetInitialData() { } TVector TVectorWorkloadGenerator::GetCleanPaths() const { - return {"vector"}; + return {Params.TableOpts.Name}; } TQueryInfoList TVectorWorkloadGenerator::GetWorkload(int type) { diff --git a/ydb/library/workload/vector/vector_workload_params.cpp b/ydb/library/workload/vector/vector_workload_params.cpp index d5192fc34585..fe87a8dbd3a8 100644 --- a/ydb/library/workload/vector/vector_workload_params.cpp +++ b/ydb/library/workload/vector/vector_workload_params.cpp @@ -14,18 +14,6 @@ namespace NYdbWorkload { void TVectorWorkloadParams::ConfigureOpts(NLastGetopt::TOpts& opts, const ECommandType commandType, int workloadType) { - auto addInitParam = [&]() { - opts.AddLongOption( "rows", "Number of vectors to init the table") - .Required().StoreResult(&VectorInitCount); - opts.AddLongOption( "distance", "Distance/similarity function") - .Required().StoreResult(&Distance); - NVector::ConfigureVectorOpts(opts, &VectorOpts); - opts.AddLongOption( "kmeans-tree-levels", "Number of levels in the kmeans tree") - .Required().StoreResult(&KmeansTreeLevels); - opts.AddLongOption( "kmeans-tree-clusters", "Number of cluster in kmeans") - .Required().StoreResult(&KmeansTreeClusters); - }; - auto addUpsertParam = [&]() { }; @@ -50,8 +38,7 @@ void TVectorWorkloadParams::ConfigureOpts(NLastGetopt::TOpts& opts, const EComma switch (commandType) { case TWorkloadParams::ECommandType::Init: - ConfigureCommonOpts(opts); - addInitParam(); + NVector::ConfigureTableOpts(opts, &TableOpts); break; case TWorkloadParams::ECommandType::Import: ConfigureCommonOpts(opts); @@ -74,9 +61,9 @@ void TVectorWorkloadParams::ConfigureOpts(NLastGetopt::TOpts& opts, const EComma void TVectorWorkloadParams::ConfigureCommonOpts(NLastGetopt::TOpts& opts) { opts.AddLongOption( "table", "Table name") - .DefaultValue("vector_index_workload").StoreResult(&TableName); + .DefaultValue(TableOpts.Name).StoreResult(&TableOpts.Name); opts.AddLongOption( "index", "Index name") - .DefaultValue("index").StoreResult(&IndexName); + .DefaultValue(IndexName).StoreResult(&IndexName); } void TVectorWorkloadParams::ConfigureIndexOpts(NLastGetopt::TOpts& opts) { @@ -100,7 +87,7 @@ TVector TVectorWorkloadParams::GetColumns() const { } void TVectorWorkloadParams::Init() { - const TString tablePath = GetFullTableName(TableName.c_str()); + const TString tablePath = GetFullTableName(TableOpts.Name.c_str()); auto session = TableClient->GetSession().ExtractValueSync().GetSession(); auto describeTableResult = session.DescribeTable(tablePath, @@ -153,12 +140,12 @@ void TVectorWorkloadParams::Init() { if (!TableRowCount) { TableRowCount = tableDescription.GetTableRows(); } - Y_ABORT_UNLESS(TableRowCount > 0, "Table %s is empty or statistics is not calculated yet", TableName.c_str()); + Y_ABORT_UNLESS(TableRowCount > 0, "Table %s is empty or statistics is not calculated yet", TableOpts.Name.c_str()); // If we have fewer vectors than requested targets, adjust Params.Targets Y_ABORT_UNLESS(TableRowCount >= Targets, "Requested more targets than row number in the dataset."); - Y_ABORT_UNLESS(indexFound, "Index %s not found in table %s", IndexName.c_str(), TableName.c_str()); + Y_ABORT_UNLESS(indexFound, "Index %s not found in table %s", IndexName.c_str(), TableOpts.Name.c_str()); if (QueryTableName) { const TString tablePath = GetFullTableName(QueryTableName.c_str()); diff --git a/ydb/library/workload/vector/vector_workload_params.h b/ydb/library/workload/vector/vector_workload_params.h index d26843616c18..e74c4314b0b8 100644 --- a/ydb/library/workload/vector/vector_workload_params.h +++ b/ydb/library/workload/vector/vector_workload_params.h @@ -31,9 +31,13 @@ class TVectorWorkloadParams final: public TWorkloadBaseParams { TVector GetColumns() const; - TString TableName; + NVector::TTableOpts TableOpts; + NVector::TTablePartitioningOpts TablePartitioningOpts; + NVector::TVectorOpts VectorOpts; + + TString IndexName = "index"; + TString QueryTableName; - TString IndexName; std::vector KeyColumns; std::string EmbeddingColumn; std::vector QueryTableKeyColumns; @@ -41,7 +45,6 @@ class TVectorWorkloadParams final: public TWorkloadBaseParams { std::optional PrefixType; NYdb::NTable::TVectorIndexSettings::EMetric Metric; TString Distance; - NVector::TVectorOpts VectorOpts; size_t KmeansTreeLevels = 0; size_t KmeansTreeClusters = 0; size_t Targets = 0;