Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions ydb/library/workload/vector/configure_opts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@

namespace NYdbWorkload::NVector {

// Registers the "table" long option on `opts`.
//
// The option takes a required argument (shown as NAME), defaults to the value
// currently held in tableOpts->Name, and stores the parsed value back into
// tableOpts->Name.
//
// `tableOpts` is dereferenced unconditionally, so it must not be null.
void ConfigureTableOpts(NLastGetopt::TOpts& opts, TTableOpts* tableOpts) {
    auto& tableOption = opts.AddLongOption( "table", "Table name");
    tableOption.RequiredArgument("NAME");
    tableOption.DefaultValue(tableOpts->Name);
    tableOption.StoreResult(&tableOpts->Name);
}

void ConfigureVectorOpts(NLastGetopt::TOpts& opts, TVectorOpts* vectorOpts) {
NColorizer::TColors colors = NYdb::NConsoleClient::AutoColors(Cout);

Expand Down
14 changes: 14 additions & 0 deletions ydb/library/workload/vector/configure_opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,25 @@

namespace NYdbWorkload::NVector {

// Options that identify the table used by the vector workload.
struct TTableOpts {
    // Table name; its initial value doubles as the default of the "table" option.
    TString Name{"vector_index_workload"};
};

// Partitioning settings substituted into the CREATE TABLE statement
// of the vector workload.
struct TTablePartitioningOpts {
    // Value for AUTO_PARTITIONING_MIN_PARTITIONS_COUNT.
    size_t MinPartitions{40};
    // Value for AUTO_PARTITIONING_PARTITION_SIZE_MB.
    size_t PartitionSize{2000};
    // Selects ENABLED (true) or DISABLED (false) for AUTO_PARTITIONING_BY_LOAD.
    bool AutoPartitioningByLoad{true};
};

// Options describing the vectors handled by the workload.
struct TVectorOpts {
    // Element type name of a vector component.
    TString VectorType{"float"};
    // Number of components per vector.
    size_t VectorDimension{1024};
};

// Registers the "table" long option on `opts`; the parsed value is stored in
// tableOpts->Name, and the current value of tableOpts->Name is the default.
void ConfigureTableOpts(NLastGetopt::TOpts& opts, TTableOpts* tableOpts);

// NOTE(review): no definition of this function is visible alongside the other
// Configure*Opts definitions — confirm one exists before adding a call site.
// Presumably registers the options that fill `partitioningOpts`.
void ConfigureTablePartitioningOpts(NLastGetopt::TOpts& opts, TTablePartitioningOpts* partitioningOpts);

// Registers vector-related options on `opts`; presumably stores the results in
// `vectorOpts` (verify against the definition).
void ConfigureVectorOpts(NLastGetopt::TOpts& opts, TVectorOpts* vectorOpts);

}
4 changes: 2 additions & 2 deletions ydb/library/workload/vector/vector_command_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ int TWorkloadCommandBuildIndex::DoRun() {
);
)_",
Params.DbPath.c_str(),
Params.TableName.c_str(),
Params.TableOpts.Name.c_str(),
Params.IndexName.c_str(),
Params.Distance.c_str(),
Params.VectorOpts.VectorType.c_str(),
Expand Down Expand Up @@ -94,7 +94,7 @@ int TWorkloadCommandDropIndex::DoRun() {
DROP INDEX `{2}`;
)_",
Params.DbPath.c_str(),
Params.TableName.c_str(),
Params.TableOpts.Name.c_str(),
Params.IndexName.c_str()
);

Expand Down
8 changes: 4 additions & 4 deletions ydb/library/workload/vector/vector_data_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ class TRandomDataGenerator final: public IBulkDataGenerator {

public:
TRandomDataGenerator(const TVectorWorkloadParams& params, const NVector::TVectorOpts& vectorOpts, const size_t rowCount, const uint32_t randomSeed)
: IBulkDataGenerator(params.TableName, rowCount)
: IBulkDataGenerator(params.TableOpts.Name, rowCount)
, Params(params)
, VectorOpts(vectorOpts)
, RowCount(rowCount)
Expand Down Expand Up @@ -360,7 +360,7 @@ class TRandomDataGenerator final: public IBulkDataGenerator {
arrow::ipc::SerializeSchema(*schema).ValueOrDie()->ToString()
);

return {MakeIntrusive<TDataPortion>(Params.GetFullTableName(Params.TableName.c_str()), std::move(arrowData), currentBatchSize)};
return {MakeIntrusive<TDataPortion>(Params.GetFullTableName(Params.TableOpts.Name.c_str()), std::move(arrowData), currentBatchSize)};
}
}
};
Expand Down Expand Up @@ -402,9 +402,9 @@ void TWorkloadVectorFilesDataInitializer::ConfigureOpts(NLastGetopt::TOpts& opts
TBulkDataGeneratorList TWorkloadVectorFilesDataInitializer::DoGetBulkInitialData() {
const auto basicDataGenerator = std::make_shared<TDataGenerator>(
*this,
Params.TableName,
Params.TableOpts.Name,
0,
Params.TableName,
Params.TableOpts.Name,
DataFiles,
Params.GetColumns(),
TDataGenerator::EPortionSizeUnit::Line
Expand Down
2 changes: 1 addition & 1 deletion ydb/library/workload/vector/vector_recall_evaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ void TVectorRecallEvaluator::SelectReferenceResults(const TVectorSampler& sample
<< "SELECT s.id AS id"
<< ", " << (isAscending ? "BOTTOM_BY" : "TOP_BY") << "(" << MakeKeyExpression(Params, "m.") <<
", Knn::" << functionName << "(m." << Params.EmbeddingColumn << ", s.embedding), " << Params.Limit << ") result_ids"
<< " FROM " << Params.TableName << " m"
<< " FROM " << Params.TableOpts.Name << " m"
<< (Params.PrefixColumn ? " INNER JOIN " : " CROSS JOIN ") << "AS_TABLE($Samples) AS s";
if (Params.PrefixColumn) {
refQueryBuilder << " ON s.prefix = m." << *Params.PrefixColumn;
Expand Down
6 changes: 3 additions & 3 deletions ydb/library/workload/vector/vector_sampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ TVectorSampler::TVectorSampler(const TVectorWorkloadParams& params)
ui64 TVectorSampler::SelectOneId(bool min) {
std::string query = std::format(R"_(--!syntax_v1
SELECT Unwrap(CAST({1} AS uint64)) AS id FROM {0} ORDER BY {1} {2} LIMIT 1;
)_", Params.TableName.c_str(), Params.KeyColumns[0].c_str(), min ? "" : "DESC");
)_", Params.TableOpts.Name.c_str(), Params.KeyColumns[0].c_str(), min ? "" : "DESC");

// Execute the query
std::optional<NYdb::TResultSet> rangeResultSet;
Expand Down Expand Up @@ -148,13 +148,13 @@ void TVectorSampler::SampleExistingVectors() {
SELECT Unwrap(CAST({0} as uint64)) as id, Unwrap({1}) as embedding, Unwrap({2}) as prefix_value
FROM {3}
WHERE {0} = {4};
)_", Params.KeyColumns[0].c_str(), Params.EmbeddingColumn.c_str(), Params.PrefixColumn->c_str(), Params.TableName.c_str(), randomId);
)_", Params.KeyColumns[0].c_str(), Params.EmbeddingColumn.c_str(), Params.PrefixColumn->c_str(), Params.TableOpts.Name.c_str(), randomId);
} else {
vectorQuery = std::format(R"_(--!syntax_v1
SELECT Unwrap(CAST({0} as uint64)) as id, Unwrap({1}) as embedding
FROM {2}
WHERE {0} = {3};
)_", Params.KeyColumns[0].c_str(), Params.EmbeddingColumn.c_str(), Params.TableName.c_str(), randomId);
)_", Params.KeyColumns[0].c_str(), Params.EmbeddingColumn.c_str(), Params.TableOpts.Name.c_str(), randomId);
}

std::optional<NYdb::TResultSet> vectorResultSet;
Expand Down
2 changes: 1 addition & 1 deletion ydb/library/workload/vector/vector_sql.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ std::string MakeSelect(const TVectorWorkloadParams& params, const TString& index
if (params.PrefixColumn)
ret << "DECLARE $PrefixValue as " << params.PrefixType << ";" << "\n";
ret << "pragma ydb.KMeansTreeSearchTopSize=\"" << params.KmeansTreeSearchClusters << "\";" << "\n";
ret << "SELECT " << MakeKeyExpression(params, "") << " AS id FROM " << params.TableName << "\n";
ret << "SELECT " << MakeKeyExpression(params, "") << " AS id FROM " << params.TableOpts.Name << "\n";
if (!indexName.empty())
ret << "VIEW " << indexName << "\n";
if (params.PrefixColumn)
Expand Down
29 changes: 18 additions & 11 deletions ydb/library/workload/vector/vector_workload_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,24 +35,31 @@ void TVectorWorkloadGenerator::Init() {

// Builds the DDL that creates the workload table.
//
// The table is created at `<Params.DbPath>/<Params.TableOpts.Name>` with an
// `id Uint64` primary key and an `embedding String` column. Size-based
// auto-partitioning is always enabled; load-based partitioning, the partition
// size (MB) and the minimum partition count come from
// Params.TablePartitioningOpts.
//
// Returns the formatted YQL text.
std::string TVectorWorkloadGenerator::GetDDLQueries() const {
    const auto& partitioning = Params.TablePartitioningOpts;
    // Placeholders: {0} database path, {1} table name, {2} ENABLED/DISABLED,
    // {3} partition size in MB, {4} minimum partition count.
    return std::format(R"_(--!syntax_v1
CREATE TABLE `{0}/{1}`(
id Uint64,
embedding String,
PRIMARY KEY(id))
WITH (
AUTO_PARTITIONING_BY_SIZE = ENABLED,
AUTO_PARTITIONING_BY_LOAD = {2},
AUTO_PARTITIONING_PARTITION_SIZE_MB = {3},
AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = {4}
)
)_",
        Params.DbPath.c_str(),
        Params.TableOpts.Name.c_str(),
        partitioning.AutoPartitioningByLoad ? "ENABLED" : "DISABLED",
        partitioning.PartitionSize,
        partitioning.MinPartitions
    );
}

// This generator provides no initial-data queries; always returns an empty list.
TQueryInfoList TVectorWorkloadGenerator::GetInitialData() {
    return TQueryInfoList{};
}

// Returns the paths to remove when the workload is cleaned up.
//
// The only path is the configured table name (Params.TableOpts.Name), so the
// cleanup targets the same table that GetDDLQueries() creates rather than a
// hard-coded name.
TVector<std::string> TVectorWorkloadGenerator::GetCleanPaths() const {
    return {Params.TableOpts.Name};
}

TQueryInfoList TVectorWorkloadGenerator::GetWorkload(int type) {
Expand Down
25 changes: 6 additions & 19 deletions ydb/library/workload/vector/vector_workload_params.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,6 @@
namespace NYdbWorkload {

void TVectorWorkloadParams::ConfigureOpts(NLastGetopt::TOpts& opts, const ECommandType commandType, int workloadType) {
auto addInitParam = [&]() {
opts.AddLongOption( "rows", "Number of vectors to init the table")
.Required().StoreResult(&VectorInitCount);
opts.AddLongOption( "distance", "Distance/similarity function")
.Required().StoreResult(&Distance);
NVector::ConfigureVectorOpts(opts, &VectorOpts);
opts.AddLongOption( "kmeans-tree-levels", "Number of levels in the kmeans tree")
.Required().StoreResult(&KmeansTreeLevels);
opts.AddLongOption( "kmeans-tree-clusters", "Number of cluster in kmeans")
.Required().StoreResult(&KmeansTreeClusters);
};

auto addUpsertParam = [&]() {
};

Expand All @@ -50,8 +38,7 @@ void TVectorWorkloadParams::ConfigureOpts(NLastGetopt::TOpts& opts, const EComma

switch (commandType) {
case TWorkloadParams::ECommandType::Init:
ConfigureCommonOpts(opts);
addInitParam();
NVector::ConfigureTableOpts(opts, &TableOpts);
break;
case TWorkloadParams::ECommandType::Import:
ConfigureCommonOpts(opts);
Expand All @@ -74,9 +61,9 @@ void TVectorWorkloadParams::ConfigureOpts(NLastGetopt::TOpts& opts, const EComma

// Registers the options shared by several workload commands:
//   "table" - table name, stored in TableOpts.Name
//   "index" - index name, stored in IndexName
//
// Each option's default is the current value of its target member, so the
// member initializers are the single source of the default values.
void TVectorWorkloadParams::ConfigureCommonOpts(NLastGetopt::TOpts& opts) {
    opts.AddLongOption( "table", "Table name")
        .DefaultValue(TableOpts.Name).StoreResult(&TableOpts.Name);
    opts.AddLongOption( "index", "Index name")
        .DefaultValue(IndexName).StoreResult(&IndexName);
}

void TVectorWorkloadParams::ConfigureIndexOpts(NLastGetopt::TOpts& opts) {
Expand All @@ -100,7 +87,7 @@ TVector<TString> TVectorWorkloadParams::GetColumns() const {
}

void TVectorWorkloadParams::Init() {
const TString tablePath = GetFullTableName(TableName.c_str());
const TString tablePath = GetFullTableName(TableOpts.Name.c_str());

auto session = TableClient->GetSession().ExtractValueSync().GetSession();
auto describeTableResult = session.DescribeTable(tablePath,
Expand Down Expand Up @@ -153,12 +140,12 @@ void TVectorWorkloadParams::Init() {
if (!TableRowCount) {
TableRowCount = tableDescription.GetTableRows();
}
Y_ABORT_UNLESS(TableRowCount > 0, "Table %s is empty or statistics is not calculated yet", TableName.c_str());
Y_ABORT_UNLESS(TableRowCount > 0, "Table %s is empty or statistics is not calculated yet", TableOpts.Name.c_str());

// If we have fewer vectors than requested targets, adjust Params.Targets
Y_ABORT_UNLESS(TableRowCount >= Targets, "Requested more targets than row number in the dataset.");

Y_ABORT_UNLESS(indexFound, "Index %s not found in table %s", IndexName.c_str(), TableName.c_str());
Y_ABORT_UNLESS(indexFound, "Index %s not found in table %s", IndexName.c_str(), TableOpts.Name.c_str());

if (QueryTableName) {
const TString tablePath = GetFullTableName(QueryTableName.c_str());
Expand Down
9 changes: 6 additions & 3 deletions ydb/library/workload/vector/vector_workload_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,20 @@ class TVectorWorkloadParams final: public TWorkloadBaseParams {

TVector<TString> GetColumns() const;

TString TableName;
NVector::TTableOpts TableOpts;
NVector::TTablePartitioningOpts TablePartitioningOpts;
NVector::TVectorOpts VectorOpts;

TString IndexName = "index";

TString QueryTableName;
TString IndexName;
std::vector<std::string> KeyColumns;
std::string EmbeddingColumn;
std::vector<std::string> QueryTableKeyColumns;
std::optional<std::string> PrefixColumn;
std::optional<std::string> PrefixType;
NYdb::NTable::TVectorIndexSettings::EMetric Metric;
TString Distance;
NVector::TVectorOpts VectorOpts;
size_t KmeansTreeLevels = 0;
size_t KmeansTreeClusters = 0;
size_t Targets = 0;
Expand Down
Loading