Skip to content

Commit abdeeb4

Browse files
xiaoxmeng authored and facebook-github-bot committed
feat: Add FlatBuffers schemas for dense index and bloom filter
Summary:
CONTEXT: Nimble needs a hash-based dense index for row-level point lookups. The index metadata is serialized using FlatBuffers.
WHAT: Add FlatBuffers schema definitions for the dense index:
- BloomFilter.fbs: Extracted into a separate file for reuse by other index types
- DenseIndex.fbs: DenseIndexDirectory, DenseIndexSection, DenseIndex, and DenseIndexPartition tables for the hash-based index structure
- BUCK/CMakeLists.txt: Build rules for the new .fbs files
Reviewed By: tanjialiang
Differential Revision: D99569601
1 parent b413698 commit abdeeb4

3 files changed

Lines changed: 165 additions & 0 deletions

File tree

dwio/nimble/tablet/BloomFilter.fbs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
namespace facebook.nimble.serialization;
18+
19+
/// Split block bloom filter for fast negative lookups.
20+
/// Uses 256-bit blocks with multiple hash probes per block,
21+
/// following the Parquet bloom filter design for cache efficiency.
/// Kept in its own schema file so other index schemas can include it
/// (DenseIndex.fbs does so via `include "BloomFilter.fbs"`).
22+
table BloomFilter {
23+
/// Number of 256-bit blocks in the filter (each block is 32 bytes).
24+
num_blocks:uint32;
25+
/// Bits per key used during construction (for diagnostics/rebuild).
26+
bits_per_key:float;
27+
/// Raw filter data. Size = num_blocks * 32 bytes
/// (256 bits per block / 8 bits per byte = 32 bytes per block).
28+
data:[uint8];
29+
}

dwio/nimble/tablet/CMakeLists.txt

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,38 @@ target_include_directories(
6464
)
6565
add_dependencies(nimble_chunk_index_fb nimble_chunk_index_schema_fb)
6666

67+
# Generate C++ bindings for BloomFilter.fbs under the custom target
# nimble_bloom_filter_schema_fb.
# NOTE(review): build_flatbuffers is defined outside this file. The argument
# order below presumably follows FlatBuffers' BuildFlatBuffers.cmake
# (schemas, schema include dirs, custom target name, additional dependencies,
# generated includes dir, binary schemas dir, copy text schemas dir) - confirm.
# BloomFilter.fbs has no `include` statements, so no schema include dir is
# passed here.
build_flatbuffers(
68+
"${CMAKE_CURRENT_SOURCE_DIR}/BloomFilter.fbs"
69+
""
70+
nimble_bloom_filter_schema_fb
71+
""
72+
"${CMAKE_CURRENT_BINARY_DIR}"
73+
""
74+
""
75+
)
76+
# INTERFACE library: carries include directories only, no sources of its own.
add_library(nimble_bloom_filter_fb INTERFACE)
77+
target_include_directories(
78+
nimble_bloom_filter_fb
79+
INTERFACE ${PROJECT_BINARY_DIR} ${FLATBUFFERS_INCLUDE_DIR}
80+
)
81+
# Make the interface target depend on the schema generation target above.
add_dependencies(nimble_bloom_filter_fb nimble_bloom_filter_schema_fb)
82+
83+
# Generate C++ bindings for DenseIndex.fbs under the custom target
# nimble_dense_index_schema_fb.
# DenseIndex.fbs includes "BloomFilter.fbs" and "Footer.fbs", so the second
# argument passes this source directory - presumably as the schema include
# search path, assuming build_flatbuffers follows FlatBuffers'
# BuildFlatBuffers.cmake argument order (confirm; it is defined elsewhere).
build_flatbuffers(
84+
"${CMAKE_CURRENT_SOURCE_DIR}/DenseIndex.fbs"
85+
"${CMAKE_CURRENT_SOURCE_DIR}"
86+
nimble_dense_index_schema_fb
87+
""
88+
"${CMAKE_CURRENT_BINARY_DIR}"
89+
""
90+
""
91+
)
92+
# INTERFACE library: carries include directories only, no sources of its own.
add_library(nimble_dense_index_fb INTERFACE)
93+
target_include_directories(
94+
nimble_dense_index_fb
95+
INTERFACE ${PROJECT_BINARY_DIR} ${FLATBUFFERS_INCLUDE_DIR}
96+
)
97+
# NOTE(review): only the dense index schema target is listed as a dependency.
# Since DenseIndex.fbs includes BloomFilter.fbs and Footer.fbs, consumers may
# also need the generated headers for those schemas to exist first - verify
# whether dependencies on their schema targets are required here.
add_dependencies(nimble_dense_index_fb nimble_dense_index_schema_fb)
98+
6799
add_library(nimble_tablet_common Compression.cpp MetadataBuffer.cpp)
68100
target_link_libraries(
69101
nimble_tablet_common

dwio/nimble/tablet/DenseIndex.fbs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
include "BloomFilter.fbs";
18+
include "Footer.fbs";
19+
20+
namespace facebook.nimble.serialization;
21+
22+
/// A single dense hash index on a set of composite key columns.
23+
/// Uses a two-level hash table: bucket directory + key arrays.
/// The bucket directory and key arrays are stored per partition in
/// DenseIndexPartition; this table holds index-wide metadata and the
/// references to those partitions.
24+
///
25+
/// Lookup flow:
26+
/// 1. Optional bloom filter check for fast negative result
27+
/// 2. Hash(encodedKey) mod num_buckets → bucket
28+
/// 3. Scan keys in bucket by exact key match
29+
/// 4. Return matching row numbers
30+
table DenseIndex {
31+
/// Names of columns forming the composite key.
32+
index_columns:[string];
33+
34+
/// Total number of rows indexed.
35+
row_count:uint64;
36+
37+
/// Number of buckets in the hash table (always power of 2).
38+
num_buckets:uint32;
39+
/// Total number of keys in the index. Diagnostic only; not used on the
40+
/// read path. Equal to row_count when there is no deduplication.
41+
num_keys:uint64;
42+
/// Load factor used during construction (for diagnostics).
43+
load_factor:float;
44+
// NOTE(review): stripe_row_counts entries are uint32 while row_count is
// uint64. An accumulated (prefix-sum) count cannot represent totals above
// 2^32 - 1 - confirm the intended width.
45+
/// Accumulated row counts per stripe (prefix sum).
46+
/// Enables translating global row number to (stripe, stripe-local row).
47+
/// Size = stripe_count.
48+
stripe_row_counts:[uint32];
49+
50+
/// Optional bloom filter for fast negative lookups.
51+
bloom_filter:BloomFilter;
52+
// Partition layout: the three partition_* vectors below each document
// "Size = partition_count", i.e. entry i of each describes partition i.
// partition_count is not a stored field; presumably it is the common length
// of these vectors - confirm against the writer.
53+
/// Partitions. The hash table is always split into one or more partitions,
54+
/// each stored as a separate DenseIndexPartition section. Small indices
55+
/// use a single partition; large indices are split by maxPartitionSizeBytes.
///
56+
/// Starting bucket index of each partition in the global bucket space (inclusive).
57+
/// Size = partition_count.
58+
partition_start_buckets:[uint32];
59+
/// Number of buckets in each partition.
60+
/// Size = partition_count.
61+
partition_bucket_counts:[uint32];
62+
/// References to the serialized DenseIndexPartition for each partition.
63+
/// Size = partition_count.
64+
partition_sections:[MetadataSection];
65+
}
66+
67+
/// Data for a single partition of a dense index.
68+
/// Stored as a separate section, loaded on-demand during lookup.
/// Holds the bucket directory (bucket_offsets, bucket_counts) and the
/// key arrays (encoded_keys, row_numbers) for the buckets of one partition.
69+
table DenseIndexPartition {
70+
/// Bucket directory for this partition. Indices are relative to the
71+
/// partition's key arrays (encoded_keys and row_numbers).
72+
bucket_offsets:[uint32];
73+
/// Number of keys in each bucket.
/// Stored as uint16, so one bucket can record at most 65535 keys.
74+
bucket_counts:[uint16];
75+
/// Binary-encoded composite keys for this partition. Aligned with
76+
/// row_numbers: encoded_keys[i] corresponds to row_numbers[i].
77+
encoded_keys:[string];
// NOTE(review): row numbers are uint32 while DenseIndex.row_count is uint64;
// global row numbers above 2^32 - 1 cannot be represented - confirm the
// intended width.
78+
/// Global row numbers for this partition. Aligned with encoded_keys.
79+
row_numbers:[uint32];
80+
}
81+
82+
/// Lightweight descriptor for a dense index.
83+
/// Stores the index columns for lookup dispatch and a MetadataSection
84+
/// pointing to the serialized DenseIndex data.
// NOTE(review): MetadataSection is not declared in this file; presumably it
// comes from the included Footer.fbs - confirm.
85+
table DenseIndexSection {
86+
/// Names of columns forming the composite key.
87+
/// Duplicated here so the reader can find the right index
88+
/// without loading the full DenseIndex data.
/// Mirrors DenseIndex.index_columns.
89+
index_columns:[string];
90+
/// Reference to the serialized DenseIndex stored as a
91+
/// separate section in the file.
92+
section:MetadataSection;
93+
}
94+
95+
/// Root table for the dense index optional section.
96+
/// Supports multiple independent dense indices per file,
97+
/// each on a different set of composite key columns.
98+
/// Stored in optional section "columnar.dense.index".
/// Each entry is a DenseIndexSection descriptor; the DenseIndex data itself
/// is stored in a separate section referenced by that descriptor.
99+
table DenseIndexDirectory {
100+
/// One descriptor per dense index defined on the file.
101+
indices:[DenseIndexSection];
102+
}
103+
104+
// DenseIndexDirectory is the root type of this schema. DenseIndex and
// DenseIndexPartition are documented above as being serialized into their
// own separate sections.
root_type DenseIndexDirectory;

0 commit comments

Comments
 (0)