From 22c9ddb33135232319318fea3b6b1151fca4e2bf Mon Sep 17 00:00:00 2001 From: ActivePeter <1020401660@qq.com> Date: Thu, 2 Jul 2026 19:03:56 +0800 Subject: [PATCH] ssd --- ...30\345\202\250\350\256\276\350\256\241.md" | 836 +++++++ fluxon_rs/Cargo.lock | 12 + fluxon_rs/fluxon_kv/Cargo.toml | 1 + fluxon_rs/fluxon_kv/src/client_kv_api/get.rs | 117 +- fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs | 357 ++- .../fluxon_kv/src/client_kv_api/msg_pack.rs | 72 + .../fluxon_kv/src/client_seg_pool/mod.rs | 13 +- fluxon_rs/fluxon_kv/src/config.rs | 199 +- .../external_client_test.rs | 2 + .../fluxon_kv/src/external_client_api/mod.rs | 3 +- fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs | 2159 +++++++++++++++++ fluxon_rs/fluxon_kv/src/kv_test.rs | 349 ++- fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs | 1 + fluxon_rs/fluxon_kv/src/lib.rs | 27 +- .../fluxon_kv/src/master_kv_router/delete.rs | 2 +- .../fluxon_kv/src/master_kv_router/get.rs | 310 ++- .../fluxon_kv/src/master_kv_router/mod.rs | 41 +- .../src/master_kv_router/msg_pack.rs | 40 + .../fluxon_kv/src/master_kv_router/put.rs | 191 +- .../lease_manager_test.rs | 12 +- .../fluxon_kv/src/memholder/memholder_test.rs | 2 + .../rpcresp_kvresult_convert/msg_and_error.rs | 2 + .../rpcresp_kvresult_convert.rs | 33 +- 23 files changed, 4713 insertions(+), 68 deletions(-) create mode 100644 "fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" create mode 100644 fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs diff --git "a/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" new file mode 100644 index 0000000..d0da8a6 --- /dev/null +++ "b/fluxon_doc_cn/design/kv_5_SSD\345\255\230\345\202\250\350\256\276\350\256\241.md" @@ -0,0 +1,836 @@ +# KV 设计 5 - SSD 存储 + +## 稳定结论 + +当前 KV 的 SSD 存储应当是分布式 owner SSD 副本层。每个 owner 可以持有本地 SSD backing tier,master 在同一条 key-version 路由里分开记录内存 owner 副本和 SSD owner 副本;它和 CPU segment 
内存副本共享 owner placement、allocation、transfer engine 和 `MemHolder` 生命周期。SSD 只承担可回填的数据源,不暴露第二套用户读写 API。 + +读路径按内存优先。master 找不到可用内存副本时,可以选择任意 owner 上的 SSD 副本;SSD owner 把磁盘数据按 chunk 读入自己节点上的 CPU staging allocation,每个 chunk ready 后立即由 SSD owner 侧复用现有 transfer engine push 到请求方 target allocation 对应 offset。SSD source 路径由 SSD owner 在所有 chunk transfer 完成后直接向 master 发送 `GetDoneReq`,再把 holder 结果放进 `SsdStageReadResp` 回给请求方;请求方不再从 SSD owner staging 发起第二段 transfer,也不再在 SSD source 路径上单独发送 `GetDoneReq`。 + +IO 层吸收 Pegaflow SSD cache 的核心做法:分片环形文件、`O_DIRECT` 对齐 buffer、`io_uring` 后台读写线程、有界读写队列、`Writing/Committed` 两阶段索引,以及 ring tail 推进时的主动失效。进一步对照 3FS 和 foyer 后,当前实现把底层 uring 调度改成 read/write 独立队列,并在同一 shard 内按 inflight 比例优先补读队列,避免 kvcache 回填读被持续写入压住;同时按 3FS 的位置生命周期约束保护正在读或正在写的 ring 位置,tail 推进不能覆盖 active IO。对大 payload 高带宽场景,aligned SSD stage 可以直接 readv 到 source staging allocation,跳过中间 aligned buffer 到 staging 的额外内存拷贝;SSD read 和 transfer 之间使用 producer/consumer pipeline,chunk read 完成即可发起对应 chunk transfer。owner 启动时从 `large_file_paths` 派生 SSD root,先按 `metadata.dev()` 去重,再为每个有效 device 建独立 writer/reader queue 和 `UringIoEngine`;shard 只在所属 device worker 的 shard 集内分配。写路径必须把内存提交和 SSD 提交拆开:内存 `PutDone` 先让 key-version 可读,SSD 写入完成后再通过独立 commit 把同一版本加入 `ssd_replicas`。 + +## 公共契约 + +公共配置只有一个 owner-only 字段: + +```yaml +fluxonkv_spec: + large_file_paths: [/data/fluxon_large] + ssd_storage: + max_bytes: 4294967296 +``` + +规则: + +- `ssd_storage` 缺省或为 `null` 时不启用 SSD。 +- `max_bytes` 必须大于或等于 512 bytes,满足当前 `O_DIRECT` 对齐约束。 +- zero-contribution external 禁止声明 `ssd_storage`;external 只能通过 owner 的 mmap、RPC 和 transfer surface 访问 SSD 回填结果。 +- 实际目录为每个可用 `large_file_root` 下的 `_cluster_kv_ssd_storage//`;owner 启动时创建目录并读取 `metadata.dev()`,同一个 device 只保留第一个 root,避免多个路径指向同一块盘时制造虚假的 IO 并行度。 +- 用户侧 `put/get/delete` API 不因 SSD 增加新入口;SSD 副本是 master 路由内部能力。 + +## 范围边界 + +| 范围 | 当前结论 | +| --- | --- | +| 分布式 SSD 读取 | 已接入。`GetStart` 可以返回任意 SSD owner,source staging allocation 位于 SSD owner,target allocation 位于请求方 owner。 | +| owner 内部多 
SSD 路径 | 已接入。SSD root 来自 `large_file_paths`,先按 device 去重;每个有效 device 有独立 writer/reader queue、uring engine 和 shard 集,单 owner 可以利用多块真实本地 SSD。 | +| 内存 KV 复用 | 已复用。SSD 回填由 SSD owner 侧调用 `transfer_data_no_copy` 按 chunk push 到 requester target,全部 chunk transfer 完成后由 SSD owner 调 master `get_done`;requester 只复用返回的 holder 结果构造 `MemHolder`。 | +| Pegaflow IO 模型 | 已接入核心形态:分片 ring、`O_DIRECT`、`io_uring`、有界队列、两阶段提交、tail 失效;写路径已经把内存 `PutDone` 和 SSD commit 拆开。 | +| 3FS 位置生命周期 | 已接入到 SSD ring。读 IO 提交前 pin committed entry;未完成的 `Writing` entry 和 pinned read entry 都会阻止物理位置复用。 | +| 大 payload direct stage | 已接入 aligned fast path 和 chunk pipeline。master 给 SSD source staging 多分配最多 511 bytes,并在 allocation 内返回 512-byte 对齐后的 `src_addr`;SSD read 按 chunk 对齐 IO 长度直接写入 staging,chunk ready 后立刻 transfer,`MemHolder` 仍只使用真实 payload 长度。 | +| 冷启动恢复 | 当前不扫描 SSD shard 重建 master 路由;路由仍由本轮运行时的 `put/get/delete` 生命周期产生。 | +| lease key 专门治理 | 当前没有单独的 lease SSD 生命周期策略;lease 与普通 key 共用 key-version 路由约束。 | +| 独立 SSD 路径参数 | 不提供。SSD 根目录从 `large_file_paths` 派生,避免和日志、共享 bundle、FS disk cache 混用。 | + +## 数据流 + +```mermaid +flowchart TD + A["owner put target allocation"] --> B["write bytes into owner mmap"] + B --> G["owner -> master PutDone(memory_ready)"] + G --> H["master route: nodes_replicas"] + B --> C["async KvSsdStorage.persist_from_addr(key, put_id, addr, len)"] + C --> D["SSD writer queue"] + D --> E["io_uring writev to sharded O_DIRECT ring"] + E --> F["commit index: Writing -> Committed"] + F --> I["owner -> master SsdReplicaCommit"] + I --> J["master route: ssd_replicas"] + + GS["get_start"] --> K{"live memory replica?"} + K -->|yes| L["return GetSourceKind::Memory"] + L --> M["existing transfer path"] + + K -->|no| N{"live SSD replica?"} + N -->|yes| O["allocate source staging on SSD owner"] + O --> P["allocate target on requester"] + P --> Q["return GetSourceKind::Ssd"] + Q --> R["SSD owner chunk readv into source staging"] + R --> S["SsdLoadedChunk(offset,len)"] + S --> W["SSD owner transfer chunk: 
staging+offset -> requester target+offset"] + W --> T["all chunks done: SSD owner -> master GetDoneReq"] + T --> V["SsdStageReadResp carries GetDoneResp fields"] + + N -->|no| U["KeyNotFound"] +``` + +## 端到端调用时序 + +SSD 路径只在两个位置扩展主链路:`put_done` 提交内存副本后,owner 异步把本地 target allocation 落到 SSD,并在完成后单独提交 SSD 副本;`get_start` 找不到可用内存副本时,master 为 SSD owner 分配 source staging,再由 SSD owner 按 chunk 把磁盘数据读入 staging 并 push 到 requester target。`get_done` 和 holder 生命周期继续走内存 KV 的原 master 逻辑,但 SSD source 路径的 `GetDoneReq` 由 SSD owner 在全部 chunk transfer 完成后发起,requester 只消费 `SsdStageReadResp` 里带回的 done 结果。 + +```mermaid +sequenceDiagram + participant C as requester owner + participant M as master + participant SO as SSD owner + participant TE as transfer engine + participant SSD as owner SSD shard + + C->>M: PutStartReq(key, len) + M-->>C: PutStartResp(target allocation) + Note over C: payload 写入 target allocation + C->>M: PutDoneReq(memory_ready) + Note right of M: nodes_replicas 写入内存副本\nkey-version 立即可读\nspawn post_put_ssd_replica_persist + M-->>C: PutDoneResp + M->>C: async SsdReplicaPersistReq(key, put_id, target_addr, len) + C->>SSD: KvSsdStorage.persist_from_addr(...) 
+ Note over SSD: writer queue -> io_uring writev -> Writing/Committed + C->>M: SsdReplicaCommitReq(key, put_id, node_id, len) + Note right of M: ssd_replicas 写入 SSD 副本 + + C->>M: GetStartReq(key) + alt live memory replica exists + M-->>C: GetStartResp(source_kind=Memory, src_addr, target_addr) + else only SSD replica exists + Note right of M: 在 SSD owner CPU segment 分配 source_allocation\n在 requester CPU segment 分配 target allocation + M-->>C: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len) + C->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len) + SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len) + loop each ready chunk + SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len) + SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len) + end + SO->>M: GetDoneReq(get_id) + Note right of M: target allocation 进入 get_holding\nsource_allocation 释放 + M-->>SO: GetDoneResp(holder_id) + SO-->>C: SsdStageReadResp(done_holder_id, done_allocation_mode) + end + opt source_kind=Memory + C->>TE: transfer_data_no_copy(read, src_addr -> target_addr, len) + C->>M: GetDoneReq(get_id) + Note right of M: target allocation 进入 get_holding + M-->>C: GetDoneResp(holder_id) + end +``` + +## 当前实现 + +| 模块 | 职责 | +| --- | --- | +| `fluxon_kv/src/config.rs` | 解析 `fluxonkv_spec.ssd_storage.max_bytes`,禁止 external 声明该字段,派生 SSD 根目录。 | +| `fluxon_kv/src/kv_ssd_storage.rs` | owner 内部 SSD cache。使用 shard 文件、`O_DIRECT`、`io_uring`、有界读写队列和两阶段索引管理 key-version bytes。 | +| `client_kv_api/put.rs` | owner 是最终 target 时,先通过 `PutDoneReq` 提交内存副本;SSD persist 由 master 的后台 `SsdReplicaPersistReq` 触发,owner 完成本地落盘后再通过独立 SSD commit 上报。 | +| `client_kv_api/get.rs` | `GetSourceKind::Ssd` 时,请求方让 SSD owner stage、push 并完成 `get_done`;stage RPC 成功后跳过请求方 transfer,也跳过请求方 `get_done`。 | +| `client_kv_api/msg_pack.rs` | 定义 `SsdStageReadReq/SsdStageReadResp` 和 
`SsdReplicaPersistReq/SsdReplicaPersistResp`,分别用于 SSD stage 读、回传 done 结果,以及 master 触发 owner 本地 SSD persist。 | +| `master_kv_router/put.rs` | `put_done` 只提交内存副本,随后异步发起 `SsdReplicaPersistReq`;`SsdReplicaCommitReq` 单独写 `ssd_replicas`。 | +| `master_kv_router/get.rs` | 内存副本优先;无内存副本时从 `ssd_replicas` 中选择可用 owner,分配 source staging 和 requester target。 | +| `master_kv_router/delete.rs` | 内存副本被驱逐时,如果同 key-version 仍有 SSD 副本,保留 `kv_routes`。 | + +## 接口里的角色分工 + +SSD 逻辑按接口看最清楚:`put` 先让一个 key-version 的内存副本 ready,再异步补交 SSD 副本;`get` 决定读请求先走内存副本还是 SSD fallback。每个接口里再分 master、owner、external 三个角色看状态归属。 + +### put + +```mermaid +sequenceDiagram + participant E as external + participant O as owner + participant M as master + participant SSD as owner SSD store + + E->>O: ExternalPutStartReq(key, len) + O->>M: PutStartReq(key, len) + Note right of M: 分配 put_id 和 src/target allocation\n记录 inflight_puts + M-->>O: PutStartResp(put_id, src_addr, target_addr) + O-->>E: ExternalPutStartResp(offsets, put_id) + + Note over E,O: external 写 owner mmap/staging + E->>O: ExternalPutTransferEndReq(put_id) + O->>O: transfer_data_no_copy if remote target + O->>M: PutDoneReq(memory_ready) + Note right of M: 写 nodes_replicas\nkey-version 立即可读\nspawn post_put_ssd_replica_persist + M-->>O: PutDoneResp + O-->>E: ExternalPutTransferEndResp + M->>O: async SsdReplicaPersistReq(key, put_id, target_addr, len) + O->>SSD: persist_from_addr(key, put_id, target_addr, len) + Note over SSD: device write_tx -> per-device ssd_writer_loop -> io_uring writev\nWriting -> Committed + O->>M: SsdReplicaCommitReq(key, put_id, node_id, len) + Note right of M: 写 ssd_replicas +``` + +#### master + +master 持有 `put` 的权威控制面状态:`inflight_puts` 记录未完成写入,`kv_routes` 记录提交后的当前版本。当前实现里 `PutDoneReq` 只表示内存副本 ready;SSD 副本通过独立 `SsdReplicaCommitReq` 进入 route。 + +当前协议结构如下。 + +```rust +pub struct MasterKvRouterInner { + pub inflight_puts: moka::future::Cache<(String, u64, u32), InflightPutInfo>, + pub kv_routes: DashMap>, + ... 
+} + +pub struct InflightPutInfo { + pub node_id: NodeID, + pub key: String, + pub req_node_id: NodeID, + pub len: u64, + pub src_target_allocation: Arc>>, +} + +pub struct OneKvNodesRoutes { + pub put_id: PutIDForAKey, + pub nodes_replicas: RwLock>, + pub ssd_replicas: RwLock>, + ... +} + +pub struct PutDoneReq { + pub key: String, + pub put_id: PutIDForAKey, + pub lease_id: Option, +} + +pub struct SsdReplicaCommitReq { + pub key: String, + pub put_id: PutIDForAKey, + pub node_id: NodeIDString, + pub len: u64, +} +``` + +`PutStartReq` 到达 master 后,master 分配 `put_id` 和源/目标 allocation,并把 allocation 放进 `InflightPutInfo.src_target_allocation`。`PutDoneReq` 到达时,master 只把 target allocation 写入 `nodes_replicas`,此时 key-version 已经可被 `get` 命中。SSD owner 后续完成落盘后再发 `SsdReplicaCommitReq`,master 校验 `kv_routes[key].put_id == put_id` 后,把 `KvSsdRouteInfo { node_id, len, tomb_tag }` 写入同一个 `OneKvNodesRoutes.ssd_replicas`。master 不保存 SSD 文件 offset,也不保存 owner 本地 ring index。 + +#### owner + +owner 持有数据面:本机 CPU segment、可选 SSD store、put transfer 和 SSD persist。当前实现里,SSD persist 发生在 master 收到 `PutDoneReq` 并提交内存路由之后,不能阻塞内存副本 ready。 + +当前 owner 字段如下。 + +```rust +pub struct ClientKvApiInner { + ssd_storage: Option>, + rpc_caller_put_start: RPCCaller, + rpc_caller_put_done: RPCCaller, + rpc_caller_ssd_replica_commit: RPCCaller, + ... 
+} + +pub struct SsdReplicaPersistReq { + pub key: String, + pub put_id: PutIDForAKey, + pub target_addr: u64, + pub len: u64, +} + +pub struct KvSsdStorage { + root_dirs: Vec, + devices: Vec, + shard_to_device: Vec, + next_write_device: AtomicUsize, + inner: Arc>, + space_notify: Arc, +} + +struct SsdDeviceWorker { + device_id: u64, + root_dir: PathBuf, + shard_ids: Vec, + _files: Vec, + _io: Arc, + write_tx: tokio_mpsc::Sender, + read_tx: tokio_mpsc::Sender, +} + +struct KvSsdStorageInner { + ring: SsdRingBuffer, +} +``` + +owner 如果是最终 target,先完成原有 transfer 和 `PutDoneReq`,让内存副本进入 `nodes_replicas`。master 随后在后台 task 里把 `SsdReplicaPersistReq { key, put_id, target_addr, len }` 发回 target owner,并持有 target allocation 的 `Arc`,保证 owner 从内存复制到 SSD 期间 payload 不会被释放或复用。 + +owner 的 `rpc_ssd_replica_persist` handler 收到请求后,从 target allocation 的绝对地址调用 `persist_local_kv_to_ssd(...)`,进入 `KvSsdStorage::persist_from_addr(key, put_id, addr, len)`。`persist_from_addr` 把真实 payload 拷到 512-byte 对齐的 `AlignedBuffer`,`persist_buffer` 通过 `next_write_device` round-robin 选择一个 `SsdDeviceWorker.write_tx` 并等待后台 writer 完成。每个 `ssd_writer_loop` 只拿自己的 `shard_ids` 调 `SsdRingBuffer::prepare_write_on_shards(...)`,在 `ring.entries` 中建立 `Writing(SsdIndexEntry)`;对应 device 的 `UringIoEngine` 对该 shard 文件执行 `writev`,成功后提交为 `Committed(SsdIndexEntry)`。这之后 owner 发送 `SsdReplicaCommitReq` 给 master,补交 SSD 副本。 + +#### external + +external 只持有写入请求上下文和 owner 暴露的 mmap offset,不持有 SSD route 或 SSD index。 + +```rust +pub struct ExternalPutStartReq { + pub key: String, + pub len: u64, + pub reject_if_inflight_same_key: bool, + pub preferred_sub_cluster: Option, + pub started_time: i64, + pub test_observe_put_phases: bool, +} + +pub struct ExternalPutTransferEndReq { + pub key: String, + pub len: u64, + pub src_offset: u64, + pub target_offset: u64, + pub peer_id: Option, + pub target_base_addr: Option, + pub put_id: Option, + pub lease_id: Option, + pub started_time: i64, + pub test_observe_put_phases: bool, +} +``` + 
+external put 仍然是 `ExternalPutStart -> 写 owner mmap -> ExternalPutTransferEnd`。`ExternalPutTransferEndResp` 只代表内存提交完成;SSD 是否启用、何时 persist 成功、何时写入 `ssd_replicas` 都由 owner 和 master 的内部 commit 协议决定。external 只通过 `started_time` 做 owner 代际校验,避免把旧代际请求提交给新 owner。 + +### get + +```mermaid +sequenceDiagram + participant E as external + participant RO as requester owner + participant M as master + participant SO as SSD owner + participant TE as transfer engine + participant SSD as owner SSD store + + E->>RO: ExternalGetReq(key) + RO->>M: GetStartReq(key) + alt memory replica exists + M-->>RO: GetStartResp(source_kind=Memory) + else SSD fallback + Note right of M: 在 SSD owner 分配 source_allocation\n在 requester owner 分配 target allocation\n写 inflight_gets + M-->>RO: GetStartResp(source_kind=Ssd, src_addr, target_addr, ssd_stage_len) + RO->>SO: SsdStageReadReq(get_id, stage_addr=src_addr, stage_len=ssd_stage_len, target_node_id, target_addr, len) + SO->>SSD: load_into_addr_chunks(key, put_id, stage_addr, len, stage_len) + Note over SSD: pin committed entry\nproducer 按 chunk readv direct 或 scratch fallback + loop each ready chunk + SSD-->>SO: SsdLoadedChunk(offset, stage_addr+offset, chunk_len) + SO->>TE: transfer_data_no_copy(write, stage+offset -> target+offset, chunk_len) + end + SO->>M: GetDoneReq(get_id) + Note right of M: target allocation -> get_holding\nsource_allocation 释放 + M-->>SO: GetDoneResp(holder_id) + SO-->>RO: SsdStageReadResp(done fields) + end + opt source_kind=Memory + RO->>RO: transfer_data_no_copy(read, src_addr -> target_addr, len) + RO->>M: GetDoneReq(get_id) + Note right of M: target allocation -> get_holding + M-->>RO: GetDoneResp(holder_id) + end + RO-->>E: ExternalGetResp(ExternalMemHolderInfo) +``` + +#### master + +master 持有 `get` 的权威路由、在途 allocation 和完成后的 holder authority。 + +```rust +pub struct MasterKvRouterInner { + pub inflight_gets: moka::future::Cache, + pub get_holding: MasterOwnerMemMgr, + pub kv_routes: DashMap>, + ... 
+} + +pub struct OneKvNodesRoutes { + pub put_id: PutIDForAKey, + pub nodes_replicas: RwLock>, + pub ssd_replicas: RwLock>, + pub get_durable_slots_used: AtomicU32, +} + +pub struct KvSsdRouteInfo { + pub node_id: NodeID, + pub len: u64, + pub tomb_tag: NodeTombTag, +} + +pub struct InflightGetInfo { + pub put_id: PutIDForAKey, + pub src_node_id: NodeID, + pub req_node_id: NodeID, + pub len: u64, + pub allocation: Arc, + pub source_allocation: Option>, + pub route: Arc, + pub allocation_mode: GetAllocationMode, + pub source_kind: GetSourceKind, +} +``` + +master 处理 `GetStartReq` 时,先查 `kv_routes`。有 live 内存副本时,返回 `GetSourceKind::Memory`。只有内存副本不可用时,master 才从 `ssd_replicas` 里选 SSD owner,并分配两块 CPU segment allocation:`source_allocation` 在 SSD owner 上,`allocation` 在 requester owner 上。`GetStartResp.src_addr` 是 SSD owner 本地对齐 staging 地址,`GetStartResp.target_addr` 是 requester target 地址,`GetStartResp.ssd_stage_len` 是对齐后的 source staging 容量,`GetStartResp.len` 始终是真实 payload 长度。 + +`GetDoneReq` 到达后,master 把 `InflightGetInfo.allocation` 转入 `get_holding`,返回 `holder_id`。memory source 路径的 `GetDoneReq` 由 requester owner 发送;SSD source 路径的 `GetDoneReq` 由 SSD owner 在全部 chunk transfer 完成后发送。master 不依赖 RPC 调用者身份决定 holder 归属,而是使用 `InflightGetInfo.req_node_id` 作为 holder 节点。`InflightGetInfo.source_allocation` 只服务 SSD owner 本地读盘 staging 和 owner-side push,不进入 `get_holding`。 + +#### owner + +owner 在 `get` 里有两个可能角色:requester owner 负责调用 master,并根据 `GetSourceKind` 选择 memory transfer 或 SSD stage RPC;SSD owner 负责响应 `SsdStageReadReq`,读取本地 SSD,把读出的 bytes 按 chunk push 到 requester target,并在全部 chunk transfer 完成后向 master 发送 `GetDoneReq`。 + +```rust +pub struct ClientKvApiInner { + ssd_storage: Option>, + pub external_get_holding: OwnerExternalMemMgr, + rpc_caller_get_start: RPCCaller, + rpc_caller_get_done: RPCCaller, + rpc_caller_ssd_stage_read: RPCCaller, + ... 
+} + +pub struct SsdStageReadReq { + pub key: String, + pub put_id: PutIDForAKey, + pub get_id: u64, + pub stage_addr: u64, + pub stage_len: u64, + pub target_node_id: NodeIDString, + pub target_addr: u64, + pub len: u64, +} + +pub struct SsdStageReadResp { + pub done_holder_id: u64, + pub done_allocation_mode: GetAllocationMode, + pub done_error_code: ErrorCode, + pub done_error_json: String, + pub done_server_process_us: i64, + pub error_code: ErrorCode, + pub error_json: String, +} +``` + +requester owner 收到 `GetSourceKind::Memory` 后走原有 transfer 分支,然后自己发送 `GetDoneReq`。requester owner 收到 `GetSourceKind::Ssd` 后调用 `stage_kv_from_ssd_source(...)`,该函数返回 `GetDoneResp` 对应字段;requester 跳过自己的 transfer,也跳过自己的 `get_done`,直接用返回的 done 结果构造 holder。 + +SSD owner 的 `rpc_ssd_stage_read` task 调用 `load_and_push_kv_from_ssd(...)`。这个函数内部把 `KvSsdStorage::load_into_addr_chunks(...)` 作为生产者,把 `transfer_loaded_ssd_chunks(...)` 作为消费者:生产者 pin 当前 committed entry,按 chunk 把磁盘数据读入 master 分配的 `stage_addr + offset`;消费者每收到一个 `SsdLoadedChunk`,立即用 `transfer_data_no_copy(peer=target_node_id, peer_src_or_target=false, stage_addr + offset, target_addr + offset, chunk_len, None)` push 到 requester target。所有 chunk transfer 成功后,SSD owner 用 `SsdStageReadReq.get_id` 调 master `GetDoneReq`,并把 `GetDoneResp` 拆成 `SsdStageReadResp.done_*` 字段返回给 requester。 + +```rust +struct SsdRingBuffer { + entries: HashMap, + read_pins: HashMap, + ... 
+} + +enum SsdEntryState { + Writing(SsdIndexEntry), + Committed(SsdIndexEntry), +} +``` + +`read_pins` 是 owner 本地 SSD ring 的生命周期保护,防止 writer 推进 tail 时覆盖 active read。chunk pipeline 在整个 producer 生命周期内持有同一个 read pin;每个 chunk 单独提交 read task。direct read 条件满足时,`readv` 直接写到 `SsdStageReadReq.stage_addr + offset`;否则先读 scratch aligned buffer,再复制当前 chunk 的真实 payload 长度到 staging。请求方 target 是否远端不影响 SSD direct read 的对齐判断。 + +#### external + +external 只发 `ExternalGetReq` 给 owner,并接收 owner 返回的 holder metadata。SSD route、SSD index、source staging allocation 都不会进入 external 进程。 + +```rust +pub struct ExternalGetReq { + pub key: String, + pub req_node_id: String, + pub started_time: i64, +} + +pub struct ExternalGetResp { + pub error_code: ErrorCode, + pub error_json: String, + pub external_memholder_info: Option, +} + +pub struct ExternalMemHolderInfo { + pub offset: u64, + pub len: u32, + pub holder_id: u64, +} + +pub struct ExternalMemHolder { + pub offset: u64, + pub addr: u64, + pub len: u32, + pub holder_id: u64, + pub key: String, + pub external_client_id: String, + pub owner_start_time: i64, + ... 
+} +``` + +owner 内部普通 `get` 完成后,会把 external 借用关系写入 `external_get_holding`,再返回 `ExternalMemHolderInfo { offset, len, holder_id }`。external 构造 `ExternalMemHolder` 后只通过 mmap offset/addr 读取结果。holder drop 时,external 发 `ExternalDeleteAckReq` 给 owner;owner 再释放 external 借用,并通过原有 owner -> master holder ack 链路释放 `get_holding`。 + +### stage 失败和释放 + +```mermaid +sequenceDiagram + participant RO as requester owner + participant M as master + participant SO as SSD owner + + RO->>SO: SsdStageReadReq + SO-->>RO: stage error + RO->>M: GetRevokeReq(drop_ssd_source=true) + Note right of M: 查 inflight_gets\n确认 source_kind=Ssd\n删除 route.ssd_replicas[src_node_id] + alt no live replica remains + M->>M: remove kv_routes and prefix index + end +``` + +```rust +pub struct GetRevokeReq { + pub get_id: u64, + pub drop_ssd_source: bool, +} +``` + +SSD stage 失败时,请求方调用 `get_revoke_ssd_source(...)`,也就是 `GetRevokeReq { drop_ssd_source: true }`。master 从 `inflight_gets` 找到 `InflightGetInfo`,只有 `source_kind == GetSourceKind::Ssd` 时才会删除 `route.ssd_replicas[src_node_id]`。如果同一 `OneKvNodesRoutes` 下已经没有 live 内存副本和 SSD 副本,master 再删除 `kv_routes` 并异步清理 prefix index。 + +RPC 字段里,`len` 始终是真实 payload 长度;`ssd_stage_len` / `stage_len` 是 SSD direct IO 需要的 staging 容量,通常是 512-byte 对齐后的长度。`target_addr` 只表示 requester target,不再表示 SSD owner 本地 staging。`SsdStageReadReq.get_id` 让 SSD owner 在全部 chunk transfer 完成后替 requester 完成 master `GetDoneReq`;`SsdStageReadResp.done_*` 是 master `GetDoneResp` 的字段投影,供 requester 复用原有 holder 构造逻辑。 + +```rust +pub struct GetStartResp { + pub get_id: u64, + pub node_id: NodeIDString, + pub put_id: PutIDForAKey, + pub source_kind: GetSourceKind, + pub target_addr: u64, + pub src_addr: u64, + pub len: u64, + pub ssd_stage_len: u64, + ... 
+} + +pub struct SsdStageReadReq { + pub key: String, + pub put_id: PutIDForAKey, + pub get_id: u64, + pub stage_addr: u64, + pub stage_len: u64, + pub target_node_id: NodeIDString, + pub target_addr: u64, + pub len: u64, +} + +pub struct SsdStageReadResp { + pub done_holder_id: u64, + pub done_allocation_mode: GetAllocationMode, + pub done_error_code: ErrorCode, + pub done_error_json: String, + pub done_server_process_us: i64, + pub error_code: ErrorCode, + pub error_json: String, +} +``` + +## 关键代码片段 + +### put_done 只提交内存副本 + +当前实现中,`put_done` 只把内存 target allocation 写入 `nodes_replicas`。SSD 是否落盘不影响这次 `PutDone` 的可见性。 + +```rust +pub struct PutDoneReq { + pub key: String, + pub put_id: PutIDForAKey, + pub lease_id: Option, +} + +one_kv_routes + .nodes_replicas + .write() + .insert(node_id.clone(), completed_info); +``` + +这段逻辑用到的字段边界是: + +- `put_id` 由 `OneKvNodesRoutes` 承载,SSD 副本和内存副本共享同一个版本。 +- `nodes_replicas` 代表内存副本 ready;`get_start` 可以立即从这里返回内存 source。 +- `ssd_replicas` 不能在这一步写入,否则 `PutDone` 会被 SSD 延迟拖住。 + +### SSD replica 独立 commit + +SSD owner 后台 persist 成功后,单独向 master 提交同一个 key-version 的 SSD 副本。master 必须校验当前 route 的 `put_id` 仍然匹配,避免旧版本 SSD late commit 污染新版本路由。 + +```rust +pub struct SsdReplicaCommitReq { + pub key: String, + pub put_id: PutIDForAKey, + pub node_id: NodeIDString, + pub len: u64, +} + +if let Some(route) = kv_routes.get(&req.key) { + if route.put_id == req.put_id { + route.ssd_replicas.write().insert( + node_id.clone(), + KvSsdRouteInfo { + node_id: node_id.clone(), + len: req.len, + tomb_tag, + }, + ); + } +} +``` + +这段逻辑用到的字段边界是: + +- `SsdReplicaCommitReq.put_id` 必须等于当前 `OneKvNodesRoutes.put_id`。 +- `SsdReplicaCommitReq.node_id` 必须对应当前 route 内已经 ready 的内存副本;master 用同一节点的 `KvRouteInfo.tomb_tag` 作为 SSD route 的 tomb 代际。 +- `SsdReplicaCommitReq.len` 记录真实 payload 长度,后续 SSD stage 和 transfer 都按这个长度对外可见。 +- `KvSsdRouteInfo` 不保存 SSD 文件 offset;offset 只在 owner 本地 SSD ring index 中。 +- late commit 命中过期 `put_id` 时直接丢弃,不能 resurrect 旧版本。 + +### get_start 
分配分布式 SSD staging + +SSD fallback 发生在 master 已经没有可用 `nodes_replicas` 之后。source staging 一定分配在 SSD owner 的 CPU segment 上,target allocation 一定分配在 requester 的 CPU segment 上。 + +```rust +let ssd_stage_len = align_ssd_io_len(ssd_replica.len)?; +let source_alloc_len = ssd_stage_len + SSD_ALIGNMENT as u64 - 1; + +let source_allocation = allocate_get_buffer_on_node( + &view, + &ssd_replica.node_id, + source_alloc_len, + get_id, + "ssd source staging", +)?; +let target_allocation = allocate_get_buffer_on_node( + &view, + &req_node_id, + ssd_replica.len, + get_id, + "requesting target", +)?; + +let source_addr = align_ssd_stage_addr(source_base + source_allocation.addr())?; +``` + +这段逻辑的关键字段关系是: + +- `KvSsdRouteInfo.node_id` 决定 source staging 的 owner。 +- `source_alloc_len = align_up(len, 512) + 511`,保证 allocation 内总能找到 512-byte 对齐的 `src_addr`。 +- `GetStartResp.src_addr` 返回对齐后的绝对地址,不一定等于 `source_allocation` 的起始地址。 +- `InflightGetInfo.source_allocation` 持有原始 allocation,保证对齐后的 `src_addr` 在整个 SSD read/push 期间有效。 +- `InflightGetInfo.allocation` 持有 requester target;memory source 由 requester `get_done` 转成 holder,SSD source 由 SSD owner `get_done` 转成 holder。 + +### requester 触发 SSD owner stage/push/done + +请求方收到 `GetSourceKind::Ssd` 后,让 SSD owner 把数据读入 `src_addr`、按 chunk push 到 `target_addr + offset`,并由 SSD owner 直接完成 master `get_done`。这里没有新增用户 API;`SsdStageReadReq` 是 owner 内部 RPC。stage RPC 成功返回时,requester target 已经可读,并且 requester 已经拿到 master done 结果;请求方跳过自己的 transfer 分支,也跳过自己的 `get_done`。 + +```rust +let mut ssd_done_resp = None; +if resp.source_kind == GetSourceKind::Ssd { + let done_resp = self.stage_kv_from_ssd_source( + &resp.node_id, + key, + put_id, + get_id, + resp.src_addr, + resp.target_addr, + data_len as u64, + resp.ssd_stage_len, + ) + .await?; + ssd_done_resp = Some(done_resp); +} + +if resp.source_kind == GetSourceKind::Ssd { + // SSD owner already pushed all chunks to target_addr and called get_done. 
+} else { + self.view.client_transfer_engine() + .transfer_data_no_copy(peer_id, true, resp.src_addr, resp.target_addr, len, None) + .await?; +} + +let done_resp = if let Some(done_resp) = ssd_done_resp { + done_resp +} else { + self.get_done(get_id).await? +}; +``` + +`stage_kv_from_ssd_source(...)` 的分支只有两个: + +- `source_node_id == self`:本地调用 `load_and_push_kv_from_ssd(...)`,SSD read 生产 chunk,transfer consumer 把每个 chunk 写到本地 `target_addr + offset`,随后直接调用 `get_done(get_id)`。 +- 远端 SSD owner:发送 `SsdStageReadReq`,由 `rpc_ssd_stage_read` task 执行 `load_and_push_kv_from_ssd(...)`,SSD owner 每读出一个 chunk 就 push 到 requester target,全部 chunk transfer 完成后再调 `get_done(get_id)` 并通过 `SsdStageReadResp.done_*` 返回。 + +### SSD chunk read 与 direct/scratch fallback + +SSD owner 侧的核心结构是 `SsdLoadedChunk` 和 `ReadCommand`。`SsdLoadedChunk` 是 read producer 交给 transfer consumer 的最小就绪单元;`ReadCommand.file_offset` 让同一个 committed entry 可以按 chunk 提交不同文件偏移的读。 + +```rust +pub(crate) struct SsdLoadedChunk { + pub offset: u64, + pub stage_addr: u64, + pub len: u64, +} + +struct ReadCommand { + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + _read_pin: Option, + done_tx: oneshot::Sender>, +} +``` + +`load_and_push_kv_from_ssd(...)` 把 SSD read 和 transfer 并行起来。producer 最多保留 `DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT` 个读 IO;consumer 最多保留同样数量的 transfer future。这样大 payload 场景里,前一个 chunk 还在网络传输时,后续 chunk 可以继续从 SSD 读入 staging。 + +```rust +let (chunk_tx, chunk_rx) = ::tokio::sync::mpsc::channel( + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT.saturating_mul(2).max(1), +); + +let producer = store.load_into_addr_chunks( + key, + put_id, + stage_addr, + len, + stage_len, + DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, + chunk_tx, +); +let consumer = self.transfer_loaded_ssd_chunks(peer_id, target_addr, chunk_rx); +let (producer_res, consumer_res) = ::tokio::join!(producer, consumer); +``` + +`load_into_addr_chunks(...)` 先 pin 当前 committed entry,pin 
生命周期覆盖整个 producer。每个 chunk 根据 `stage_addr + offset`、`entry.file_offset + offset` 和剩余 staging 容量选择 direct 或 scratch;chunk read 完成后立即发送 `SsdLoadedChunk`。 + +```rust +let (entry, _read_pin) = { + let mut inner = self.inner.lock(); + let Some(entry) = inner.ring.pin_read(&key) else { + return Err(KvError::Api(ApiError::KeyNotFound { key: key.key.clone() })); + }; + (entry, SsdReadPin { ... }) +}; + +let file_offset = entry.file_offset + offset; +let target = match choose_chunk_read_path(stage_addr, read_len, target_len, file_offset) { + SsdReadPath::Direct => ReadTarget::Direct { + target_addr: stage_addr, + len: read_len as usize, + }, + SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(read_len as usize)?), +}; + +let output = submit_read_command(key, entry, file_offset, target, None).await?; +if let ReadOutput::Scratch(buffer) = output { + copy_payload_to_stage(buffer, stage_addr, payload_len)?; +} +ready_tx.send(SsdLoadedChunk { offset, stage_addr, len: payload_len }).await?; +``` + +direct 路径把 `readv` 的目标直接设为当前 chunk 的 source staging;scratch 路径先读入 aligned buffer,再只复制当前 chunk 的真实 payload 长度到 staging。两条路径最后都只把真实 payload 长度暴露给 transfer 和 `MemHolder`。 + +## IO 模型 + +```mermaid +flowchart TD + A["large_file_paths"] --> B["derive SSD roots"] + B --> C["create root dirs + metadata.dev()"] + C --> D["deduplicate device roots"] + D --> E0["SsdDeviceWorker device 0"] + D --> E1["SsdDeviceWorker device 1"] + E0 --> F0["shard_ids: 0,2,..."] + E1 --> F1["shard_ids: 1,3,..."] + F0 --> G0["device 0 writer/read queues"] + F1 --> G1["device 1 writer/read queues"] + G0 --> H0["device 0 UringIoEngine"] + G1 --> H1["device 1 UringIoEngine"] + I["persist_buffer"] --> J["next_write_device round-robin"] + J --> G0 + J --> G1 + K["submit_read_command(entry.shard_id)"] --> L["shard_to_device"] + L --> G0 + L --> G1 +``` + +| 组件 | 设计 | +| --- | --- | +| device root | owner 从 `large_file_paths` 派生 SSD root,创建目录后读取 `metadata.dev()`;同一 device 只保留第一个 root。 | +| shard 文件 | 每个 
owner 将 `max_bytes` 切成少量 shard,文件位于有效 device root 的 `shards/` 下,`shard_to_device` 记录 shard 到 device worker 的映射。 | +| 对齐 | 数据写入前复制到 512-byte 对齐 buffer,实际 IO 长度按 512-byte 向上对齐。 | +| 写队列 | `persist_from_addr` 只把任务送入某个 device 的有界 writer queue;后台 writer 控制 inflight 数量,并只在本 device 的 `shard_ids` 内分配 ring 空间。 | +| 读队列 | `load_into_addr_chunks` 先 pin committed 索引,再按 `entry.shard_id -> shard_to_device` 找到对应 device reader queue。只要 chunk staging 地址、文件 offset 和 staging 容量满足对齐约束,就直接读入目标 staging;否则读到 scratch aligned buffer 后只复制当前 chunk 的真实 payload 长度。 | +| io_uring | 每个有效 device 拥有自己的 `UringIoEngine`,engine 内多个后台线程持有 `IoUring`,使用 `readv/writev` 提交该 device 的 shard 文件 IO。底层每个 uring shard 有独立 read/write 发送队列,按 read/write inflight 比例调度,优先保护 kvcache 回填读延迟。 | +| 索引状态 | 新写入先进入 `Writing`;只有 IO 完成且 offset 仍有效时才转为 `Committed`。 | +| 位置保护 | `load_into_addr_chunks` 在 producer 生命周期内 pin committed entry;writer 分配新 ring 空间前检查 pinned read 和未完成 `Writing` entry,必要时等待 active IO 释放位置。 | +| ring 失效 | shard head 推进超过容量时推进 tail,并移除被覆盖 key-version 的本地索引。 | + +## Task / Actor / 独立线程 + +SSD 路径里有三层异步执行单元。控制面仍复用 KV 原有 actor;SSD 只为 owner 本地磁盘 IO 增加后台 task 和独立 uring 线程。owner 内部的 SSD task 按去重后的 effective device 创建;多个 `large_file_paths` 如果落在同一个 `metadata.dev()` 上,只创建一组 device worker。 + +| 执行单元 | 创建位置 | 类型 | 输入 | 职责 | +| --- | --- | --- | --- | --- | +| `ssd_writer_loop` | `KvSsdStorage::new`,每个 effective device 一个 | `tokio::task::spawn` | `SsdDeviceWorker.write_tx` | 从 `persist_from_addr` 接收写任务,只在本 device 的 `shard_ids` 内调用 `SsdRingBuffer::prepare_write_on_shards`,提交 `writev`,完成后 `commit(Writing -> Committed)`。 | +| `ssd_reader_loop` | `KvSsdStorage::new`,每个 effective device 一个 | `tokio::task::spawn` | `SsdDeviceWorker.read_tx` | 从 `load_into_addr_chunks` 接收属于本 device shard 的 chunk 读任务,提交 direct/scratch `readv`,校验 offset 仍有效,完成后回传 chunk 读结果;整条 producer 完成后释放 `SsdReadPin`。 | +| `fluxon-kv-ssd-uring-{idx}` | 每个 device 的 `UringIoEngine::new_multi` | `std::thread::spawn` | `read_rx/write_rx: crossbeam::channel` | 
每个线程持有一个 `IoUring`,只提交本 device shard 文件的 `Readv/Writev` SQE,并按 read/write inflight 比例调度后回传 CQE。 | +| `rpc_ssd_replica_commit` | `MasterKvRouter` RPC handler 注册 | `view.spawn(...)` | `SsdReplicaCommitReq` | owner SSD persist 成功后提交 SSD 副本,master 校验 `put_id` 后写 `ssd_replicas`。 | +| `rpc_ssd_stage_read` | `ClientKvApi` RPC handler 注册 | `view.spawn(...)` | `SsdStageReadReq` | 远端 SSD owner 收到 stage 请求后,在 owner 进程内调用 `load_and_push_kv_from_ssd(...)`;SSD read producer 和 transfer consumer 流水线完成后,再调用 master `get_done` 并回传 done fields。 | +| `ssd_failure_remove_prefix_index` | `get_revoke(drop_ssd_source=true)` | `view.spawn(...)` | 失败 SSD source 的 key | 当失败 SSD source 是最后一个 live replica 时,异步删除 prefix index。 | + +没有单独的 SSD master route actor。SSD route 的权威更新点仍是原有 master RPC handler: + +- `PutDone`:同步更新 `nodes_replicas`,让内存副本立即可读。 +- `SsdReplicaCommit`:SSD persist 完成后同步更新 `ssd_replicas`,并拒绝过期 `put_id`。 +- `GetStart`:同步选择内存副本或 SSD 副本,并写入 `inflight_gets`。 +- `GetRevoke`:同步删除失败 SSD source;必要时触发 prefix index 小任务。 +- `Delete` / 覆盖写失效:复用原有 `delete_broadcast` 管线。 + +后台 task 的生命周期绑定 `KvSsdStorage`: + +```rust +for device in deduplicated_device_roots { + let io = Arc::new(UringIoEngine::new_multi(device_shard_fds, cfg)?); + task::spawn(ssd_writer_loop(..., shard_ids.clone())); + task::spawn(ssd_reader_loop(...)); + devices.push(SsdDeviceWorker { shard_ids, _io: io, ... }); +} + +std::thread::Builder::new() + .name(format!("fluxon-kv-ssd-uring-{idx}")) + .spawn(move || UringShard { ... 
}.run())?; +``` + +`KvSsdStorage` 通过每个 `SsdDeviceWorker` 持有 `_files` 和 `_io`,确保该 device 的 shard fd 与 uring 线程生命周期覆盖所有读写 task。`UringIoEngine::drop` 会关闭 read/write channel,并 join 所有 uring 线程。 + +## 3FS 和 foyer 对照 + +| 参考点 | 对 kvcache SSD 的结论 | +| --- | --- | +| foyer read/write split queue | 已落地到底层 `UringIoEngine`。写入 flush 和回填读进入不同发送队列,同一 uring shard 内用 inflight 比例避免读饥饿。 | +| foyer multi-partition device | 已落地到 owner 内部 per-device worker。`large_file_paths` 仍是唯一配置来源;owner 按 `metadata.dev()` 去重后为每个有效 device 建独立 writer/read queue、uring engine 和 shard 集。 | +| foyer block buffer/reclaimer | 适合后续把小 key-version 合并成 blob,并用 blob index 加速恢复;当前 kvcache value 以较大连续 payload 为主,先保留单 key-version 连续写入。 | +| 3FS write-new-position then commit metadata | 当前 `Writing/Committed` 两阶段索引已经匹配这条原则:IO 成功前不暴露 SSD 副本。 | +| 3FS read holds chunk position reference | 已落地到 SSD ring 内部。读提交前 pin entry;tail 推进和物理 offset 复用必须避开 pinned read。 | +| 3FS aligned direct read | 已落地 aligned fast path。master 自己控制 SSD source staging allocation,因此可以在 allocation 内选择对齐后的 source 地址,并把 SSD IO 长度扩到 512-byte 对齐;真实 payload 长度仍用于 transfer 和用户可见 `MemHolder`。 | +| 3FS batch read/RDMA response | Fluxon 已复用现有 transfer engine,并已落地 read/transfer chunk pipeline;后续优化重点放在批量 SSD stage、批量 transfer 和小窗口 staging allocation。 | +| PegaFlow fire-and-forget SSD ingest | 已落地到 put 路径。master 在 `PutDone` 中先提交内存 route,再通过后台 `post_put_ssd_replica_persist` 触发 owner 本地 SSD persist;owner 落盘成功后用 `SsdReplicaCommitReq` 独立提交 SSD route。 | + +## 不变量 + +- `ssd_replicas` 和 `nodes_replicas` 都属于同一个 `OneKvNodesRoutes.put_id`,不能跨版本复用。 +- `PutDoneReq` 只表示内存副本 ready,不能记录 SSD 副本。 +- master 只有在收到匹配当前 `put_id` 的 `SsdReplicaCommitReq` 后才能记录 SSD 副本。 +- `SsdReplicaCommitReq` 是内部控制面 RPC,不改变用户侧 `put/get/delete` API。 +- `GetSourceKind::Ssd` 必须有 source staging allocation,并由 master 持有到 SSD owner 发起的 `get_done` 或 requester 发起的 `get_revoke`。 +- SSD 回填失败必须通过 `get_revoke(drop_ssd_source=true)` 清理 in-flight get,并从 master 路由里移除失败的 SSD 副本。 +- SSD ring 本地失效后,master 可能短暂保留旧 SSD 
路由;下一次 stage 失败会触发主动路由失效。 +- SSD ring tail 推进不能覆盖 active IO:未完成的 `Writing` entry 和 pinned read entry 必须先释放。 +- SSD direct stage 只在目标地址、SSD 内部对齐长度和文件 offset 都满足 512-byte 对齐,且 staging 容量覆盖对齐长度时启用;transfer 和用户可见 `MemHolder` 长度始终保持真实 payload 长度。 +- master 路由被删除后,旧 SSD bytes 即使还在 shard 文件里,也不能被公共 `get` 命中。 + +## 关键结论 + +这套实现把 SSD 做成和 CPU segment 同级的分布式数据源副本,但不新增并行的用户 API 或传输协议。Pegaflow 的优势被放在 owner 内部 IO 层:异步 direct IO、分片 ring、提交态隔离和队列背压;foyer 的 read/write 队列调度用于保护回填读延迟;3FS 的位置生命周期、aligned direct read 和 read/transfer chunk pipeline 用于保护 active IO 并减少大 payload 回填拷贝和串行等待。后续重点是批量 SSD stage、批量 transfer、小窗口 staging allocation 和 pipeline 观测指标。Fluxon 的优势继续由原有内存 KV 路由、allocation、transfer 和 holder 生命周期承接。 diff --git a/fluxon_rs/Cargo.lock b/fluxon_rs/Cargo.lock index a4b0ecd..3e8638a 100644 --- a/fluxon_rs/Cargo.lock +++ b/fluxon_rs/Cargo.lock @@ -1237,6 +1237,7 @@ dependencies = [ "hyper 0.14.32", "iceoryx2", "iceoryx2-cal", + "io-uring", "kanal", "lazy_static", "libc", @@ -2395,6 +2396,17 @@ dependencies = [ "str_stack", ] +[[package]] +name = "io-uring" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9080b15e63775b9a2ac7dca720f7050a8b955e092ea0f6020a4a80f69998cdc0" +dependencies = [ + "bitflags 2.9.1", + "cfg-if", + "libc", +] + [[package]] name = "ipnet" version = "2.11.0" diff --git a/fluxon_rs/fluxon_kv/Cargo.toml b/fluxon_rs/fluxon_kv/Cargo.toml index 22ff136..8208216 100644 --- a/fluxon_rs/fluxon_kv/Cargo.toml +++ b/fluxon_rs/fluxon_kv/Cargo.toml @@ -75,6 +75,7 @@ bytes = "1" pprof = { version = "0.15", features = ["flamegraph"] } hex = "0.4" sha2 = "0.10" +io-uring = "0.7" tokio-tungstenite = { version = "0.21", default-features = false, features = ["connect", "handshake"], optional = true } sockudo-ws = { version = "^1.7.4", default-features = false, features = ["tokio-runtime", "fastrand"], optional = true } diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs index 
f309dd0..29da3f8 100644 --- a/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs +++ b/fluxon_rs/fluxon_kv/src/client_kv_api/get.rs @@ -13,7 +13,7 @@ use crate::{ cluster_manager::NodeID, master_kv_router::msg_pack::{ GetAllocationMode, GetDoneReq, GetDoneResp, GetMetaReq, GetMetaResp, GetRevokeReq, - GetStartReq, GetStartResp, + GetSourceKind, GetStartReq, GetStartResp, }, p2p::msg_pack::MsgPack, rpcresp_kvresult_convert::msg_and_error::codes_api, @@ -26,19 +26,27 @@ use std::sync::Arc; pub struct RemoteGetInfo { get_id: u64, data_len: usize, + source_kind: GetSourceKind, src_addr: u64, target_addr: u64, node_id: NodeID, peer_is_src_or_target: bool, } +impl RemoteGetInfo { + pub fn source_kind(&self) -> GetSourceKind { + self.source_kind + } +} + impl std::fmt::Display for RemoteGetInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "GetInfo{{ get_id: {}, data_len: {} bytes, src_addr: {:#x}, target_addr: {:#x}, node_id: {:?}, remote_transfer: {} }}", + "GetInfo{{ get_id: {}, data_len: {} bytes, source_kind: {:?}, src_addr: {:#x}, target_addr: {:#x}, node_id: {:?}, remote_transfer: {} }}", self.get_id, self.data_len, + self.source_kind, self.src_addr, self.target_addr, self.node_id, @@ -177,8 +185,80 @@ impl ClientKvApiInner { ); } + let mut ssd_done_resp = None; + if resp.source_kind == GetSourceKind::Ssd { + let ssd_stage_len = resp.ssd_stage_len; + if ssd_stage_len < data_len as u64 { + #[cfg(test)] + { + self.test_record.remove_transfering_get(get_id); + } + + self.get_revoke(get_id).await?; + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "invalid ssd stage len for key={} get_id={} data_len={} ssd_stage_len={}", + key, get_id, data_len, ssd_stage_len + ), + })); + } + let done_resp = match self + .stage_kv_from_ssd_source( + &resp.node_id, + key, + put_id, + get_id, + abs_src, + abs_target, + data_len as u64, + ssd_stage_len, + ) + .await + { + Ok(done_resp) => done_resp, + Err(err) => { + 
tracing::warn!( + "kv get ssd stage failed: key={}, source_node={}, stage={:#x}, target={:#x}, len={}, ssd_stage_len={}, err={}", + key, + resp.node_id, + abs_src, + abs_target, + data_len, + ssd_stage_len, + err + ); + + #[cfg(test)] + { + self.test_record.remove_transfering_get(get_id); + } + + obe_get_transfer_error(&metrics, &client_id, &node_role, key, data_len as u64); + self.get_revoke_ssd_source(get_id).await?; + return Err(err); + } + }; + ssd_done_resp = Some(done_resp); + tracing::debug!( + "kv get ssd staged and pushed: key={}, source_node={}, stage={:#x}, target={:#x}, len={}, ssd_stage_len={}", + key, + resp.node_id, + abs_src, + abs_target, + data_len, + ssd_stage_len + ); + } + // transfer data (skip if local and src==target to avoid redundant copy) - if peer_id.is_none() && abs_src == abs_target { + if resp.source_kind == GetSourceKind::Ssd { + tracing::debug!( + "kv get ssd owner push complete: key={}, target={:#x}, len={} (skip requester transfer)", + key, + abs_target, + data_len + ); + } else if peer_id.is_none() && abs_src == abs_target { tracing::debug!( "kv get local no-op: src==target {:#x}, len={} (skip transfer)", abs_target, @@ -249,12 +329,17 @@ impl ClientKvApiInner { // Removed post-transfer zero-header verification per request. - // Complete the get operation and get holder_id - let done_resp = match self.get_done(get_id).await { - Ok(resp) => resp, - Err(err) => { - obe_get_end_error_rpc(&metrics, &client_id, &node_role, key, data_len as u64); - return Err(err); + // Complete the get operation and get holder_id. SSD source already called + // get_done after pushing into the requester target. 
+ let done_resp = if let Some(done_resp) = ssd_done_resp { + done_resp + } else { + match self.get_done(get_id).await { + Ok(resp) => resp, + Err(err) => { + obe_get_end_error_rpc(&metrics, &client_id, &node_role, key, data_len as u64); + return Err(err); + } } }; let end_handle_us = done_resp.server_process_us; @@ -326,6 +411,7 @@ impl ClientKvApiInner { let get_info = RemoteGetInfo { get_id, data_len, + source_kind: resp.source_kind, src_addr: abs_src, target_addr: abs_target, node_id: resp.node_id.into(), @@ -435,8 +521,19 @@ impl ClientKvApiInner { /// 撤销 Get 操作,释放已分配的资源 pub async fn get_revoke(&self, get_id: u64) -> KvResult<()> { + self.get_revoke_inner(get_id, false).await + } + + async fn get_revoke_ssd_source(&self, get_id: u64) -> KvResult<()> { + self.get_revoke_inner(get_id, true).await + } + + async fn get_revoke_inner(&self, get_id: u64, drop_ssd_source: bool) -> KvResult<()> { let req = MsgPack { - serialize_part: GetRevokeReq { get_id }, + serialize_part: GetRevokeReq { + get_id, + drop_ssd_source, + }, raw_bytes: Vec::new(), }; diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs index dec19f5..bd4655b 100644 --- a/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs +++ b/fluxon_rs/fluxon_kv/src/client_kv_api/mod.rs @@ -3,11 +3,17 @@ use crate::client_kv_api::msg_pack::{ ExternalDeleteAckReq, ExternalDeleteAckResp, ExternalDeleteReq, ExternalDeleteResp, ExternalGetReq, ExternalGetResp, ExternalIsExistReq, ExternalIsExistResp, ExternalPutCommitReq, ExternalPutCommitResp, ExternalPutRevokeReq, ExternalPutRevokeResp, ExternalPutStartReq, - ExternalPutStartResp, ExternalPutTransferEndReq, ExternalPutTransferEndResp, SyncKvToFileReq, - SyncKvToFileResp, TestPutPhaseTrace, + ExternalPutStartResp, ExternalPutTransferEndReq, ExternalPutTransferEndResp, + SsdReplicaPersistReq, SsdReplicaPersistResp, SsdStageReadReq, SsdStageReadResp, + SyncKvToFileReq, SyncKvToFileResp, TestPutPhaseTrace, }; use 
crate::cluster_manager::NodeIDString; +use crate::cluster_manager::app_logic_ext::ClusterManagerAppLogicExt; use crate::config::TestSpecConfig; +use crate::kv_ssd_storage::{ + DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, + KvSsdStorage, KvSsdStorageInit, SsdLoadedChunk, +}; use crate::master_kv_router::msg_pack::{ BatchDeleteAckReq, BatchDeleteClientKvMetaCacheReq, DeleteClientKvMetaCacheItem, }; @@ -22,8 +28,8 @@ use crate::{ client_transfer_engine::{ClientTransferEngine, ClientTransferEngineAccessTrait}, cluster_manager::{ClusterEvent, ClusterManager, ClusterManagerAccessTrait}, master_kv_router::msg_pack::{ - DeleteReq, GetDoneReq, GetMetaReq, GetRevokeReq, GetStartReq, PutDoneReq, PutRevokeReq, - PutStartReq, + DeleteReq, GetDoneReq, GetDoneResp, GetMetaReq, GetRevokeReq, GetStartReq, PutDoneReq, + PutRevokeReq, PutStartReq, SsdReplicaCommitReq, }, metric_reporter::{MetricReporter, MetricReporterAccessTrait}, metrics::{MetricsHandle, OperationKind, RequestStage}, @@ -37,6 +43,7 @@ use async_trait::async_trait; use dashmap::DashMap; use fluxon_framework::{LogicalModule, define_module}; use fluxon_util::map_lock::AMapLock; +use futures::stream::{FuturesUnordered, StreamExt}; use limit_thirdparty::tokio; use parking_lot::Mutex; use std::sync::Weak; @@ -451,6 +458,89 @@ async fn handle_external_put_revoke( } } +async fn handle_ssd_stage_read( + view: &ClientKvApiView, + msg: &MsgPack, +) -> MsgPack { + let req = msg.serialize_part.clone(); + let inner = view.client_kv_api().inner(); + let done_resp = match inner + .load_and_push_kv_from_ssd( + &req.key, + req.put_id, + req.stage_addr, + req.stage_len, + &req.target_node_id, + req.target_addr, + req.len, + ) + .await + { + Ok(()) => inner.get_done(req.get_id).await, + Err(err) => Err(err), + }; + + match done_resp { + Ok(done_resp) => MsgPack { + serialize_part: SsdStageReadResp { + done_holder_id: done_resp.holder_id, + done_allocation_mode: done_resp.allocation_mode, + 
done_error_code: done_resp.error_code, + done_error_json: done_resp.error_json, + done_server_process_us: done_resp.server_process_us, + error_code: crate::rpcresp_kvresult_convert::msg_and_error::OK, + error_json: String::new(), + }, + raw_bytes: Vec::new(), + }, + Err(err) => MsgPack { + serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err), + raw_bytes: Vec::new(), + }, + } +} + +async fn handle_ssd_replica_persist( + view: &ClientKvApiView, + msg: &MsgPack, +) -> MsgPack { + let req = msg.serialize_part.clone(); + let inner = view.client_kv_api().inner(); + let persisted = match inner + .persist_local_kv_to_ssd(&req.key, req.put_id, req.target_addr, req.len) + .await + { + Ok(persisted) => persisted, + Err(err) => { + return MsgPack { + serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err), + raw_bytes: Vec::new(), + }; + } + }; + + if persisted { + if let Err(err) = inner + .commit_ssd_replica_to_master(&req.key, req.put_id, req.len) + .await + { + return MsgPack { + serialize_part: crate::rpcresp_kvresult_convert::FromError::from_error(&err), + raw_bytes: Vec::new(), + }; + } + } + + MsgPack { + serialize_part: SsdReplicaPersistResp { + persisted, + error_code: crate::rpcresp_kvresult_convert::msg_and_error::OK, + error_json: String::new(), + }, + raw_bytes: Vec::new(), + } +} + async fn handle_external_delete_ack( view: &ClientKvApiView, msg: &MsgPack, @@ -729,6 +819,7 @@ define_module!( #[derive(Clone, Debug)] pub struct ClientKvApiNewArg { pub test_spec_config: TestSpecConfig, + pub ssd_storage: Option, } pub struct ClientKvApi(ClientKvApiInner); @@ -775,6 +866,7 @@ impl std::ops::Deref for ClientKvApiViewHolder { pub struct ClientKvApiInner { view: ClientKvApiViewHolder, test_spec_config: TestSpecConfig, + ssd_storage: Option>, metrics: OnceLock>, /// make sure each remote kv get run in order @@ -818,6 +910,8 @@ pub struct ClientKvApiInner { rpc_caller_external_put_commit: RPCCaller, 
rpc_caller_external_put_revoke: RPCCaller, rpc_caller_resolve_side_transfer_lane: RPCCaller, + rpc_caller_ssd_stage_read: RPCCaller, + rpc_caller_ssd_replica_commit: RPCCaller, /// Default lease id recorded for inspection/convenience, but NOT auto-applied. /// Callers must explicitly pass `Some(lease_id)` to attach a put to a lease. @@ -900,6 +994,222 @@ impl ClientKvApiInner { pub(crate) fn skip_put_end_commit_enabled(&self) -> bool { self.test_spec_config.skip_put_end_commit } + + pub(crate) async fn persist_local_kv_to_ssd( + &self, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + abs_addr: u64, + len: u64, + ) -> KvResult { + let Some(store) = self.ssd_storage.as_ref() else { + return Ok(false); + }; + store.persist_from_addr(key, put_id, abs_addr, len).await?; + Ok(true) + } + + pub(crate) async fn commit_ssd_replica_to_master( + &self, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + len: u64, + ) -> KvResult<()> { + let node_id = self.view.cluster_manager().get_self_info().id.clone(); + let req = MsgPack { + serialize_part: SsdReplicaCommitReq { + key: key.to_string(), + put_id, + node_id, + len, + }, + raw_bytes: Vec::new(), + }; + let master_node_id = self + .view + .cluster_manager() + .find_or_wait_master_node() + .await?; + let resp = self + .rpc_caller_ssd_replica_commit + .call( + self.view.p2p_module(), + master_node_id.into(), + req, + Some(Duration::from_secs(60)), + 2, + ) + .await + .map_err(KvError::from)?; + crate::rpcresp_kvresult_convert::try_from_code( + resp.serialize_part.error_code, + resp.serialize_part.error_json, + ) + } + + pub(crate) async fn load_and_push_kv_from_ssd( + &self, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + stage_addr: u64, + stage_len: u64, + target_node_id: &NodeIDString, + target_addr: u64, + len: u64, + ) -> KvResult<()> { + let Some(store) = self.ssd_storage.as_ref() else { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd 
storage is not enabled on this owner".to_string(), + })); + }; + + let self_node_id = &self.view.cluster_manager().get_self_info().id; + let peer_id = if target_node_id == self_node_id { + None + } else { + Some(target_node_id.clone()) + }; + let (chunk_tx, chunk_rx) = ::tokio::sync::mpsc::channel( + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT + .saturating_mul(2) + .max(1), + ); + let producer = store.load_into_addr_chunks( + key, + put_id, + stage_addr, + len, + stage_len, + DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES, + DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT, + chunk_tx, + ); + let consumer = self.transfer_loaded_ssd_chunks(peer_id, target_addr, chunk_rx); + let (producer_res, consumer_res) = ::tokio::join!(producer, consumer); + match (producer_res, consumer_res) { + (Ok(()), Ok(())) => Ok(()), + (_, Err(err)) => Err(err), + (Err(err), _) => Err(err), + } + } + + async fn transfer_loaded_ssd_chunks( + &self, + peer_id: Option, + target_addr: u64, + mut chunk_rx: ::tokio::sync::mpsc::Receiver, + ) -> KvResult<()> { + let mut inflight = FuturesUnordered::new(); + let mut rx_open = true; + + loop { + tokio::select! 
{ + maybe_chunk = chunk_rx.recv(), if rx_open && inflight.len() < DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT => { + match maybe_chunk { + Some(chunk) => { + let chunk_target_addr = target_addr.checked_add(chunk.offset).ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd transfer target addr overflow: target_addr={:#x} offset={}", + target_addr, + chunk.offset + ), + }) + })?; + let transfer_engine = self.view.client_transfer_engine(); + let peer_id = peer_id.clone(); + inflight.push(async move { + transfer_engine + .transfer_data_no_copy( + peer_id, + false, + chunk.stage_addr, + chunk_target_addr, + chunk.len, + None, + ) + .await?; + Ok::<(), KvError>(()) + }); + } + None => { + rx_open = false; + } + } + } + Some(result) = inflight.next(), if !inflight.is_empty() => { + result?; + } + else => { + if !rx_open && inflight.is_empty() { + break; + } + } + } + } + Ok(()) + } + + pub(crate) async fn stage_kv_from_ssd_source( + &self, + source_node_id: &NodeIDString, + key: &str, + put_id: crate::master_kv_router::put::PutIDForAKey, + get_id: u64, + stage_addr: u64, + target_addr: u64, + len: u64, + stage_len: u64, + ) -> KvResult { + let self_node_id = self.view.cluster_manager().get_self_info().id.clone(); + if source_node_id == &self_node_id { + self.load_and_push_kv_from_ssd( + key, + put_id, + stage_addr, + stage_len, + &self_node_id, + target_addr, + len, + ) + .await?; + return self.get_done(get_id).await; + } + + let req = MsgPack { + serialize_part: SsdStageReadReq { + key: key.to_string(), + put_id, + get_id, + stage_addr, + stage_len, + target_node_id: self_node_id, + target_addr, + len, + }, + raw_bytes: Vec::new(), + }; + let resp = self + .rpc_caller_ssd_stage_read + .call( + self.view.p2p_module(), + source_node_id.clone().into(), + req, + Some(Duration::from_secs(60)), + 0, + ) + .await + .map_err(KvError::from)?; + let resp = resp.serialize_part; + crate::rpcresp_kvresult_convert::try_from_code(resp.error_code, 
resp.error_json)?; + Ok(GetDoneResp { + holder_id: resp.done_holder_id, + allocation_mode: resp.done_allocation_mode, + error_code: resp.done_error_code, + error_json: resp.done_error_json, + server_process_us: resp.done_server_process_us, + }) + } } #[derive(Debug, Clone)] @@ -1518,10 +1828,16 @@ impl ClientKvApi { pub async fn construct(arg: ClientKvApiNewArg) -> Result { tracing::info!("Constructing ClientKvApi in Client mode (PreView)"); + let ssd_storage = arg + .ssd_storage + .map(KvSsdStorage::new) + .transpose()? + .map(Arc::new); let inner = ClientKvApiInner { view: ClientKvApiViewHolder::new(), test_spec_config: arg.test_spec_config, + ssd_storage, metrics: OnceLock::new(), all_memholder_refcount: OnceLock::new(), get_remote_kv_lock: AMapLock::new(Duration::from_secs(60)), @@ -1554,6 +1870,8 @@ impl ClientKvApi { rpc_caller_external_put_commit: RPCCaller::new(), rpc_caller_external_put_revoke: RPCCaller::new(), rpc_caller_resolve_side_transfer_lane: RPCCaller::new(), + rpc_caller_ssd_stage_read: RPCCaller::new(), + rpc_caller_ssd_replica_commit: RPCCaller::new(), default_lease_id: parking_lot::RwLock::new(None), }; Ok(Self(inner)) @@ -1587,6 +1905,12 @@ impl ClientKvApi { inner .rpc_caller_resolve_side_transfer_lane .regist(inner.view.p2p_module()); + inner + .rpc_caller_ssd_stage_read + .regist(inner.view.p2p_module()); + inner + .rpc_caller_ssd_replica_commit + .regist(inner.view.p2p_module()); crate::key_prefix::init_for_p2p_owner(inner.view.p2p_module()); crate::kvlease::init_for_p2p_owner(inner.view.p2p_module()); // Register master-only metric RPC callers @@ -1686,6 +2010,31 @@ impl ClientKvApi { }, ); + let view_ext = inner.view.clone_view(); + RPCHandler::::new().regist(inner.view.p2p_module(), move |resp, msg| { + let view = view_ext.clone(); + let view_task = view.clone(); + let _ = view.spawn("rpc_ssd_stage_read", async move { + let result = handle_ssd_stage_read(&view_task, &msg).await; + let _ = resp.send_resp(result).await; + }); + Ok(()) + 
}); + + let view_ext = inner.view.clone_view(); + RPCHandler::::new().regist( + inner.view.p2p_module(), + move |resp, msg| { + let view = view_ext.clone(); + let view_task = view.clone(); + let _ = view.spawn("rpc_ssd_replica_persist", async move { + let result = handle_ssd_replica_persist(&view_task, &msg).await; + let _ = resp.send_resp(result).await; + }); + Ok(()) + }, + ); + let view_ext = inner.view.clone_view(); RPCHandler::::new().regist(inner.view.p2p_module(), move |resp, msg| { let view = view_ext.clone(); diff --git a/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs b/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs index 55f0970..bae5437 100644 --- a/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs +++ b/fluxon_rs/fluxon_kv/src/client_kv_api/msg_pack.rs @@ -1,8 +1,10 @@ +use crate::master_kv_router::msg_pack::GetAllocationMode; use crate::master_kv_router::put::PutIDForAKey; use crate::p2p::msg_pack::{MsgPackSerializePart, RPCReq}; use crate::rpcresp_kvresult_convert::msg_and_error::ErrorCode; use bitcode::{Decode, Encode}; +use crate::cluster_manager::NodeIDString; use crate::memholder::ExternalMemHolderInfo; #[derive(Default, Debug, Clone, Encode, Decode)] @@ -89,6 +91,76 @@ impl MsgPackSerializePart for ExternalGetResp { } } +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdStageReadReq { + pub key: String, + pub put_id: PutIDForAKey, + pub get_id: u64, + pub stage_addr: u64, + pub stage_len: u64, + pub target_node_id: NodeIDString, + pub target_addr: u64, + pub len: u64, +} + +impl MsgPackSerializePart for SsdStageReadReq { + fn msg_id(&self) -> u32 { + 4020 + } +} + +impl RPCReq for SsdStageReadReq { + type Resp = SsdStageReadResp; +} + +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdStageReadResp { + pub done_holder_id: u64, + pub done_allocation_mode: GetAllocationMode, + pub done_error_code: ErrorCode, + pub done_error_json: String, + pub done_server_process_us: i64, + pub error_code: ErrorCode, + pub 
error_json: String, +} + +impl MsgPackSerializePart for SsdStageReadResp { + fn msg_id(&self) -> u32 { + 4021 + } +} + +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaPersistReq { + pub key: String, + pub put_id: PutIDForAKey, + pub target_addr: u64, + pub len: u64, +} + +impl MsgPackSerializePart for SsdReplicaPersistReq { + fn msg_id(&self) -> u32 { + 4022 + } +} + +impl RPCReq for SsdReplicaPersistReq { + type Resp = SsdReplicaPersistResp; +} + +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaPersistResp { + pub persisted: bool, + pub error_code: ErrorCode, + pub error_json: String, +} + +impl MsgPackSerializePart for SsdReplicaPersistResp { + fn msg_id(&self) -> u32 { + 4023 + } +} + // #[derive(Default, Debug, Clone, Encode, Decode)] // pub struct ExternalPutReq { // pub key: String, diff --git a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs index 1aa6954..8c7cc78 100644 --- a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs +++ b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs @@ -237,10 +237,7 @@ impl ClientSegPool { std::path::Path::new(share_mem_path).join(SIDE_TRANSFER_PEERS_DIRNAME) } - pub fn side_transfer_peer_file_path( - share_mem_path: &str, - side_id: &str, - ) -> std::path::PathBuf { + pub fn side_transfer_peer_file_path(share_mem_path: &str, side_id: &str) -> std::path::PathBuf { Self::side_transfer_peers_dir(share_mem_path).join(format!("{side_id}.json")) } @@ -399,17 +396,13 @@ impl ClientSegPool { crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MappingFailed { path: String::new(), len: map_len as u64, - detail: "share_mem_path is empty; explicit configuration required" - .to_string(), + detail: "share_mem_path is empty; explicit configuration required".to_string(), }, )); } let base_path = &share_mem_path; - tracing::info!( - "Using share_mem_path: {} for memory-mapped file", - base_path - ); + tracing::info!("Using share_mem_path: {} 
for memory-mapped file", base_path); std::fs::create_dir_all(base_path).map_err(|e| { KvError::SharedMem( crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MappingFailed { diff --git a/fluxon_rs/fluxon_kv/src/config.rs b/fluxon_rs/fluxon_kv/src/config.rs index f9c7691..02f6e3f 100644 --- a/fluxon_rs/fluxon_kv/src/config.rs +++ b/fluxon_rs/fluxon_kv/src/config.rs @@ -581,6 +581,8 @@ pub struct FluxonKvSpecYaml { #[serde(skip_serializing_if = "Option::is_none")] pub large_file_paths: Option, #[serde(skip_serializing_if = "Option::is_none")] + pub ssd_storage: Option>, + #[serde(skip_serializing_if = "Option::is_none")] pub p2p_listen_port: Option, #[serde(skip_serializing_if = "Option::is_none")] pub redis_compat: Option>, @@ -592,6 +594,17 @@ pub struct FluxonKvSpecYaml { #[serde(transparent)] pub struct LargeFilePathsYaml(pub Vec); +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct KvSsdStorageConfigYaml { + pub max_bytes: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct KvSsdStorageConfig { + pub max_bytes: u64, +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct RedisCompatConfigYaml { @@ -682,6 +695,34 @@ impl LargeFilePaths { .into_kverror()) } + fn resolve_all_usable_root_subdirs( + &self, + relative_dir: &Path, + target_name: &str, + ) -> KvResult> { + self.require_configured_paths()?; + let mut out = Vec::new(); + let mut errors = Vec::new(); + for root in &self.paths { + let candidate = Path::new(root).join(relative_dir); + match fs::create_dir_all(&candidate) { + Ok(()) => out.push(candidate), + Err(err) => errors.push(format!("{} ({})", candidate.display(), err)), + } + } + if out.is_empty() { + return Err(ConfigError::InvalidClientConfig { + detail: format!( + "large_file_paths contains no usable root for {}; tried: {}", + target_name, + errors.join(", ") + ), + } + .into_kverror()); + } + Ok(out) + } + pub fn kv_logs_dir(&self, 
cluster_name: &str) -> KvResult { let relative_dir = PathBuf::from(format!("{cluster_name}_cluster_kv_logs")); self.resolve_preferred_root_subdir(&relative_dir, "kv logs") @@ -714,6 +755,18 @@ impl LargeFilePaths { "fluxon fs disk cache", ) } + + pub fn kv_ssd_storage_dirs( + &self, + cluster_name: &str, + instance_key: &str, + ) -> KvResult> { + let relative_dir = PathBuf::from(format!( + "{cluster_name}_cluster_kv_ssd_storage/{}", + crate::kv_ssd_storage::safe_path_component(instance_key) + )); + self.resolve_all_usable_root_subdirs(&relative_dir, "kv ssd storage") + } } /// KV client backend types supported by the system @@ -733,8 +786,9 @@ pub struct ClientConfig { pub pprof_duration_seconds: Option, pub redis_compat_listen_addr: Option, pub fluxonkv_spec: FluxonKvSpec, - pub share_mem_path: String, // Mandatory shared bundle path + pub share_mem_path: String, // Mandatory shared bundle path pub large_file_paths: LargeFilePaths, // Mandatory large-file roots for logs and caches + pub ssd_storage: Option, pub test_spec_config: TestSpecConfig, } @@ -1028,6 +1082,13 @@ impl ClientConfigYaml { } .into_kverror()); } + if self.fluxonkv_spec.ssd_storage.is_some() { + return Err(ConfigError::InvalidClientConfig { + detail: "fluxonkv_spec.ssd_storage is forbidden in zero-contribution mode" + .to_string(), + } + .into_kverror()); + } } // Preserve historical behavior for configs that omit `protocol`, but allow @@ -1170,13 +1231,15 @@ impl ClientConfigYaml { } else { let Some(large_file_paths_yaml) = self.fluxonkv_spec.large_file_paths.as_ref() else { return Err(ConfigError::InvalidClientConfig { - detail: "fluxonkv_spec.large_file_paths is required for owner mode" - .to_string(), + detail: "fluxonkv_spec.large_file_paths is required for owner mode".to_string(), } .into_kverror()); }; LargeFilePaths { - paths: verify_non_empty_root_path_list(&large_file_paths_yaml.0, "large_file_paths")?, + paths: verify_non_empty_root_path_list( + &large_file_paths_yaml.0, + 
"large_file_paths", + )?, } }; @@ -1204,6 +1267,28 @@ impl ClientConfigYaml { } }; + let ssd_storage = if is_external { + None + } else { + match std::mem::take(&mut self.fluxonkv_spec.ssd_storage) { + None | Some(YamlNullable::Null) => None, + Some(YamlNullable::Value(raw)) => { + if raw.max_bytes < crate::kv_ssd_storage::SSD_ALIGNMENT as u64 { + return Err(ConfigError::InvalidClientConfig { + detail: format!( + "fluxonkv_spec.ssd_storage.max_bytes must be >= {}", + crate::kv_ssd_storage::SSD_ALIGNMENT + ), + } + .into_kverror()); + } + Some(KvSsdStorageConfig { + max_bytes: raw.max_bytes, + }) + } + } + }; + Ok(ClientConfig { cluster_name: fluxonkv_spec.cluster_name.clone(), etcd_addresses_raw, @@ -1215,6 +1300,7 @@ impl ClientConfigYaml { fluxonkv_spec, share_mem_path, large_file_paths, + ssd_storage, test_spec_config, }) } @@ -1647,7 +1733,80 @@ fluxonkv_spec: .unwrap(); let err = cfg.verify().unwrap_err(); let text = format!("{err}"); - assert!(text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode")); + assert!( + text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode") + ); + } + + #[test] + fn client_config_owner_accepts_ssd_storage() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_owner +contribute_to_cluster_pool_size: + dram: 16777216 + vram: {} +fluxonkv_spec: + etcd_addresses: ["127.0.0.1:2379"] + cluster_name: test_cluster + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] + ssd_storage: + max_bytes: 1048576 + sub_cluster: rack-a +"#, + ) + .unwrap(); + let verified = cfg.verify().unwrap(); + assert_eq!( + verified.ssd_storage.as_ref().map(|cfg| cfg.max_bytes), + Some(1048576) + ); + } + + #[test] + fn client_config_owner_rejects_too_small_ssd_storage() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_owner +contribute_to_cluster_pool_size: + dram: 16777216 + vram: {} +fluxonkv_spec: + etcd_addresses: ["127.0.0.1:2379"] + 
cluster_name: test_cluster + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] + ssd_storage: + max_bytes: 1 + sub_cluster: rack-a +"#, + ) + .unwrap(); + let err = cfg.verify().unwrap_err(); + let text = format!("{err}"); + assert!( + text.contains("fluxonkv_spec.ssd_storage.max_bytes must be >= 512"), + "{text}" + ); + } + + #[test] + fn client_config_zero_contribution_rejects_ssd_storage() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_external +fluxonkv_spec: + cluster_name: test_cluster + share_mem_path: /tmp/test_external + ssd_storage: + max_bytes: 1048576 +"#, + ) + .unwrap(); + let err = cfg.verify().unwrap_err(); + let text = format!("{err}"); + assert!(text.contains("fluxonkv_spec.ssd_storage is forbidden in zero-contribution mode")); } #[test] @@ -1667,7 +1826,9 @@ fluxonkv_spec: let logs_dir = large_file_paths.kv_logs_dir("test_cluster").unwrap(); assert_eq!( logs_dir, - first_root.join("child").join("test_cluster_cluster_kv_logs") + first_root + .join("child") + .join("test_cluster_cluster_kv_logs") ); assert!(logs_dir.exists()); @@ -1683,6 +1844,32 @@ fluxonkv_spec: assert!(third_party_logs_dir.exists()); } + #[test] + fn large_file_paths_uses_all_usable_roots_for_kv_ssd_storage() { + let tempdir = new_test_dir("fluxon_large_paths_uses_all_usable_roots_for_kv_ssd_storage"); + let first_root = tempdir.join("first_root"); + let second_root = tempdir.join("second_root"); + + let large_file_paths = LargeFilePaths { + paths: vec![ + first_root.to_string_lossy().into_owned(), + second_root.to_string_lossy().into_owned(), + ], + }; + + let dirs = large_file_paths + .kv_ssd_storage_dirs("test_cluster", "owner/a:b") + .unwrap(); + assert_eq!( + dirs, + vec![ + first_root.join("test_cluster_cluster_kv_ssd_storage/owner_a_b"), + second_root.join("test_cluster_cluster_kv_ssd_storage/owner_a_b"), + ] + ); + assert!(dirs.iter().all(|dir| dir.exists())); + } + #[test] fn 
client_test_spec_config_accepts_explicit_rdma_device_names() { let cfg = ClientConfigYaml::from_str( diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs index da701cd..630a8ea 100644 --- a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs +++ b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs @@ -89,6 +89,7 @@ fn new_client_config( large_file_paths: LargeFilePaths { paths: vec![format!("{}_large", shm_path)], }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } @@ -130,6 +131,7 @@ fn new_zero_contribution_client_config( }, share_mem_path: shm_path.to_string(), large_file_paths: LargeFilePaths { paths: Vec::new() }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs index 9cb291f..b7715dd 100644 --- a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs +++ b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs @@ -865,8 +865,7 @@ impl ExternalInner { return Ok(false); } - self.finish_owner_recover(&share_mem_path, payload) - .await?; + self.finish_owner_recover(&share_mem_path, payload).await?; Ok(true) } diff --git a/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs b/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs new file mode 100644 index 0000000..26d711e --- /dev/null +++ b/fluxon_rs/fluxon_kv/src/kv_ssd_storage.rs @@ -0,0 +1,2159 @@ +use crate::master_kv_router::put::PutIDForAKey; +use crate::rpcresp_kvresult_convert::msg_and_error::{ApiError, KvError, KvResult}; +use ::tokio::{ + sync::{Notify, mpsc as tokio_mpsc, oneshot}, + task, +}; +use futures::stream::{FuturesUnordered, StreamExt}; +use io_uring::{IoUring, opcode, types::Fd}; +use parking_lot::Mutex; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fs::{self, OpenOptions}; +use std::io; +use std::os::fd::{AsRawFd, 
RawFd}; +use std::os::unix::fs::MetadataExt; +use std::os::unix::fs::OpenOptionsExt; +use std::path::{Path, PathBuf}; +use std::ptr::NonNull; +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::thread::JoinHandle; + +pub(crate) const SSD_ALIGNMENT: usize = 512; +const DEFAULT_SHARDS_PER_OWNER: usize = 4; +const DEFAULT_URING_THREADS: usize = 16; +const DEFAULT_URING_IO_DEPTH: usize = 128; +const DEFAULT_URING_READ_WEIGHT: usize = 2; +const DEFAULT_WRITE_QUEUE_DEPTH: usize = 8; +const DEFAULT_READ_QUEUE_DEPTH: usize = 16; +const DEFAULT_WRITE_INFLIGHT: usize = 2; +const DEFAULT_READ_INFLIGHT: usize = 16; +pub(crate) const DEFAULT_READ_TRANSFER_PIPELINE_CHUNK_BYTES: u64 = 4 * 1024 * 1024; +pub(crate) const DEFAULT_READ_TRANSFER_PIPELINE_INFLIGHT: usize = 4; + +#[derive(Clone, Debug)] +pub struct KvSsdStorageInit { + pub root_dirs: Vec, + pub max_bytes: u64, +} + +#[derive(Debug)] +pub struct KvSsdStorage { + root_dirs: Vec, + devices: Vec, + shard_to_device: Vec, + next_write_device: AtomicUsize, + inner: Arc>, + space_notify: Arc, +} + +#[derive(Debug)] +struct SsdDeviceWorker { + device_id: u64, + root_dir: PathBuf, + shard_ids: Vec, + _files: Vec, + _io: Arc, + write_tx: tokio_mpsc::Sender, + read_tx: tokio_mpsc::Sender, +} + +#[derive(Clone, Debug)] +struct SsdDeviceRoot { + device_id: u64, + root_dir: PathBuf, +} + +struct OpenedSsdShard { + shard_id: usize, + device_idx: usize, + file: std::fs::File, +} + +#[derive(Clone, Copy, Debug)] +pub(crate) struct SsdLoadedChunk { + pub offset: u64, + pub stage_addr: u64, + pub len: u64, +} + +#[derive(Debug)] +struct KvSsdStorageInner { + ring: SsdRingBuffer, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +struct KvSsdKey { + key: String, + put_id: PutIDForAKey, +} + +#[derive(Clone, Debug)] +struct SsdIndexEntry { + shard_id: usize, + begin: u64, + len: u64, + aligned_len: u64, + file_offset: u64, +} + +#[derive(Clone, Debug)] +struct SsdReadPinInfo { + entry: SsdIndexEntry, + count: 
usize, +} + +#[derive(Clone, Debug)] +enum SsdEntryState { + Writing(SsdIndexEntry), + Committed(SsdIndexEntry), +} + +impl SsdEntryState { + fn entry(&self) -> &SsdIndexEntry { + match self { + Self::Writing(entry) | Self::Committed(entry) => entry, + } + } +} + +#[derive(Debug)] +struct SsdShardRing { + capacity: u64, + head: u64, + tail: u64, + order: VecDeque, +} + +#[derive(Debug)] +struct SsdRingBuffer { + shards: Vec, + next_shard: usize, + entries: HashMap, + read_pins: HashMap, +} + +#[derive(Debug)] +enum SsdPreparedWrite { + Ready(SsdIndexEntry), + Existing, + BlockedByBusyIo, +} + +#[derive(Debug)] +enum SsdAllocation { + Ready { begin: u64, file_offset: u64 }, + BlockedByBusyIo, + TooLarge, +} + +impl SsdRingBuffer { + fn new(shard_capacities: Vec) -> Self { + assert!(!shard_capacities.is_empty()); + Self { + shards: shard_capacities + .into_iter() + .map(|capacity| SsdShardRing { + capacity, + head: 0, + tail: 0, + order: VecDeque::new(), + }) + .collect(), + next_shard: 0, + entries: HashMap::new(), + read_pins: HashMap::new(), + } + } + + #[cfg(test)] + fn get(&self, key: &KvSsdKey) -> Option { + match self.entries.get(key) { + Some(SsdEntryState::Committed(entry)) if self.is_offset_valid(entry) => { + Some(entry.clone()) + } + _ => None, + } + } + + fn pin_read(&mut self, key: &KvSsdKey) -> Option { + let entry = match self.entries.get(key) { + Some(SsdEntryState::Committed(entry)) if self.is_offset_valid(entry) => entry.clone(), + _ => return None, + }; + let pin = self + .read_pins + .entry(key.clone()) + .or_insert_with(|| SsdReadPinInfo { + entry: entry.clone(), + count: 0, + }); + pin.count += 1; + Some(entry) + } + + fn unpin_read(&mut self, key: &KvSsdKey) { + match self.read_pins.get_mut(key) { + Some(pin) if pin.count > 1 => pin.count -= 1, + Some(_) => { + self.read_pins.remove(key); + } + None => debug_assert!(false, "missing kv ssd read pin for key={key:?}"), + } + } + + #[cfg(test)] + fn prepare_write(&mut self, key: KvSsdKey, len: 
u64) -> KvResult { + let allowed_shards = (0..self.shards.len()).collect::>(); + self.prepare_write_on_shards(key, len, &allowed_shards) + } + + fn prepare_write_on_shards( + &mut self, + key: KvSsdKey, + len: u64, + allowed_shards: &[usize], + ) -> KvResult { + if allowed_shards.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd device has no shards".to_string(), + })); + } + if self.entries.contains_key(&key) { + return Ok(SsdPreparedWrite::Existing); + } + let aligned_len = align_up_u64(len, SSD_ALIGNMENT as u64)?; + let max_capacity = self + .shards + .iter() + .enumerate() + .filter(|(idx, _)| allowed_shards.contains(idx)) + .map(|(_, shard)| shard.capacity) + .max() + .ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd device has invalid shard set: {allowed_shards:?}"), + }) + })?; + if aligned_len > max_capacity { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd value len={} aligned_len={} exceeds shard capacity={}", + len, aligned_len, max_capacity + ), + })); + } + + let shard_count = self.shards.len(); + for offset in 0..shard_count { + let shard_id = (self.next_shard + offset) % shard_count; + if !allowed_shards.contains(&shard_id) { + continue; + } + let (begin, file_offset) = match self.allocate_contiguous(shard_id, aligned_len) { + SsdAllocation::Ready { begin, file_offset } => (begin, file_offset), + SsdAllocation::BlockedByBusyIo => continue, + SsdAllocation::TooLarge => unreachable!("aligned_len was checked against capacity"), + }; + self.next_shard = (shard_id + 1) % shard_count; + + let entry = SsdIndexEntry { + shard_id, + begin, + len, + aligned_len, + file_offset, + }; + self.entries + .insert(key.clone(), SsdEntryState::Writing(entry.clone())); + self.shards[shard_id].order.push_back(key); + return Ok(SsdPreparedWrite::Ready(entry)); + } + + Ok(SsdPreparedWrite::BlockedByBusyIo) + } + + fn allocate_contiguous(&mut self, shard_id: usize, size: 
u64) -> SsdAllocation { + let shard = &self.shards[shard_id]; + if size > shard.capacity { + return SsdAllocation::TooLarge; + } + let capacity = shard.capacity; + let mut head = shard.head; + let phys = head % capacity; + let space_until_end = capacity - phys; + if size > space_until_end { + head += space_until_end; + } + let begin = head; + let new_head = head + size; + let new_tail = new_head.saturating_sub(capacity); + if self.has_busy_entries_before(shard_id, new_tail) { + return SsdAllocation::BlockedByBusyIo; + } + + self.shards[shard_id].head = new_head; + self.advance_tail(shard_id, new_tail); + SsdAllocation::Ready { + begin, + file_offset: begin % capacity, + } + } + + fn advance_tail(&mut self, shard_id: usize, new_tail: u64) { + if new_tail <= self.shards[shard_id].tail { + return; + } + debug_assert!(!self.has_busy_entries_before(shard_id, new_tail)); + self.shards[shard_id].tail = new_tail; + + while let Some(key) = self.shards[shard_id].order.front() { + match self.entries.get(key) { + Some(state) if state.entry().begin >= new_tail => break, + _ => { + let key = self.shards[shard_id] + .order + .pop_front() + .expect("front key exists"); + self.entries.remove(&key); + } + } + } + } + + fn commit(&mut self, key: &KvSsdKey, success: bool) -> bool { + let Some(state) = self.entries.get(key) else { + return false; + }; + let entry = match state { + SsdEntryState::Writing(entry) => entry.clone(), + SsdEntryState::Committed(_) => return true, + }; + if !self.is_offset_valid(&entry) || !success { + self.entries.remove(key); + return false; + } + self.entries + .insert(key.clone(), SsdEntryState::Committed(entry)); + true + } + + fn remove(&mut self, key: &KvSsdKey) { + self.entries.remove(key); + } + + fn is_offset_valid(&self, entry: &SsdIndexEntry) -> bool { + self.shards + .get(entry.shard_id) + .is_some_and(|shard| entry.begin >= shard.tail) + } + + fn has_busy_entries_before(&self, shard_id: usize, new_tail: u64) -> bool { + if new_tail <= 
self.shards[shard_id].tail { + return false; + } + let writing_busy = self.entries.values().any(|state| match state { + SsdEntryState::Writing(entry) => entry.shard_id == shard_id && entry.begin < new_tail, + SsdEntryState::Committed(_) => false, + }); + if writing_busy { + return true; + } + self.read_pins + .values() + .any(|pin| pin.entry.shard_id == shard_id && pin.entry.begin < new_tail) + } +} + +struct SsdReadPin { + inner: Arc>, + space_notify: Arc, + key: KvSsdKey, +} + +impl Drop for SsdReadPin { + fn drop(&mut self) { + self.inner.lock().ring.unpin_read(&self.key); + self.space_notify.notify_one(); + } +} + +struct WriteCommand { + key: KvSsdKey, + entry_len: u64, + data: AlignedBuffer, + done_tx: oneshot::Sender>, +} + +struct ReadCommand { + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + _read_pin: Option, + done_tx: oneshot::Sender>, +} + +struct WriteTask { + key: KvSsdKey, + entry: SsdIndexEntry, + data: AlignedBuffer, + done_tx: oneshot::Sender>, +} + +struct ReadTask { + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + _read_pin: Option, + done_tx: oneshot::Sender>, +} + +struct WriteCompletion { + key: KvSsdKey, + success: bool, + result: KvResult<()>, + done_tx: oneshot::Sender>, +} + +struct ReadCompletion { + key: KvSsdKey, + entry: SsdIndexEntry, + result: KvResult, + _read_pin: Option, + done_tx: oneshot::Sender>, +} + +enum ReadTarget { + Scratch(AlignedBuffer), + Direct { target_addr: u64, len: usize }, +} + +enum ReadOutput { + Scratch(AlignedBuffer), + Direct, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum SsdReadPath { + Scratch, + Direct, +} + +pub fn safe_path_component(raw: &str) -> String { + let mut out = String::with_capacity(raw.len().max(1)); + for ch in raw.chars() { + if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.') { + out.push(ch); + } else { + out.push('_'); + } + } + if out.is_empty() { + "unnamed".to_string() + } else { + out + } 
+} + +impl KvSsdStorage { + pub fn new(init: KvSsdStorageInit) -> KvResult { + if init.max_bytes < SSD_ALIGNMENT as u64 { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd storage max_bytes must be >= {}", SSD_ALIGNMENT), + })); + } + if init.root_dirs.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs must contain at least one path".to_string(), + })); + } + + let device_roots = deduplicate_device_roots(&init.root_dirs)?; + let effective_root_dirs = device_roots + .iter() + .map(|root| root.root_dir.clone()) + .collect::>(); + let shard_count = choose_shard_count(init.max_bytes, device_roots.len()); + let shard_capacity = aligned_shard_capacity(init.max_bytes, shard_count)?; + let opened_shards = open_cache_files(&device_roots, shard_count, shard_capacity)?; + let inner = Arc::new(Mutex::new(KvSsdStorageInner { + ring: SsdRingBuffer::new(vec![shard_capacity; shard_count]), + })); + let space_notify = Arc::new(Notify::new()); + let mut shard_to_device = vec![0usize; shard_count]; + let mut device_shards = device_roots + .iter() + .map(|root| (root.clone(), Vec::<(usize, std::fs::File)>::new())) + .collect::>(); + for opened in opened_shards { + shard_to_device[opened.shard_id] = opened.device_idx; + device_shards[opened.device_idx] + .1 + .push((opened.shard_id, opened.file)); + } + + let mut devices = Vec::with_capacity(device_shards.len()); + for (device_root, shard_files) in device_shards { + let shard_ids = shard_files + .iter() + .map(|(shard_id, _)| *shard_id) + .collect::>(); + let fds = shard_files + .iter() + .map(|(shard_id, file)| (*shard_id, file.as_raw_fd())) + .collect::>(); + let io = Arc::new(UringIoEngine::new_multi( + fds, + UringConfig { + threads: DEFAULT_URING_THREADS, + io_depth: DEFAULT_URING_IO_DEPTH, + }, + )?); + let (write_tx, write_rx) = tokio_mpsc::channel(DEFAULT_WRITE_QUEUE_DEPTH); + let (read_tx, read_rx) = tokio_mpsc::channel(DEFAULT_READ_QUEUE_DEPTH); 
+ + task::spawn(ssd_writer_loop( + Arc::clone(&inner), + write_rx, + Arc::clone(&io), + Arc::clone(&space_notify), + DEFAULT_WRITE_INFLIGHT, + shard_ids.clone(), + )); + task::spawn(ssd_reader_loop( + Arc::clone(&inner), + read_rx, + Arc::clone(&io), + DEFAULT_READ_INFLIGHT, + )); + + devices.push(SsdDeviceWorker { + device_id: device_root.device_id, + root_dir: device_root.root_dir, + shard_ids, + _files: shard_files + .into_iter() + .map(|(_, file)| file) + .collect::>(), + _io: io, + write_tx, + read_tx, + }); + } + + Ok(Self { + root_dirs: effective_root_dirs, + devices, + shard_to_device, + next_write_device: AtomicUsize::new(0), + inner, + space_notify, + }) + } + + pub fn root_dirs(&self) -> &[PathBuf] { + &self.root_dirs + } + + fn next_write_tx(&self) -> KvResult> { + if self.devices.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage has no active device".to_string(), + })); + } + let idx = self.next_write_device.fetch_add(1, Ordering::Relaxed) % self.devices.len(); + Ok(self.devices[idx].write_tx.clone()) + } + + fn read_tx_for_shard(&self, shard_id: usize) -> KvResult> { + let Some(device_idx) = self.shard_to_device.get(shard_id).copied() else { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd invalid shard id for read: {}", shard_id), + })); + }; + let Some(device) = self.devices.get(device_idx) else { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd invalid device index for read: shard_id={} device_idx={}", + shard_id, device_idx + ), + })); + }; + if !device.shard_ids.contains(&shard_id) { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd shard/device route mismatch: shard_id={} device_idx={} device_id={} root_dir={}", + shard_id, + device_idx, + device.device_id, + device.root_dir.display() + ), + })); + } + Ok(device.read_tx.clone()) + } + + pub async fn persist_from_addr( + &self, + key: &str, + put_id: 
PutIDForAKey, + addr: u64, + len: u64, + ) -> KvResult<()> { + validate_key(key)?; + let len_usize = usize::try_from(len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd persist len does not fit usize: {}", len), + }) + })?; + let aligned_len = align_up_usize(len_usize, SSD_ALIGNMENT)?; + let data = unsafe { AlignedBuffer::copy_from_addr(addr, len_usize, aligned_len)? }; + self.persist_buffer(key, put_id, len, data).await + } + + pub async fn persist(&self, key: &str, put_id: PutIDForAKey, data: &[u8]) -> KvResult<()> { + validate_key(key)?; + let aligned_len = align_up_usize(data.len(), SSD_ALIGNMENT)?; + let mut buffer = AlignedBuffer::zeroed(aligned_len)?; + unsafe { + std::ptr::copy_nonoverlapping(data.as_ptr(), buffer.as_mut_ptr(), data.len()); + } + self.persist_buffer(key, put_id, data.len() as u64, buffer) + .await + } + + async fn persist_buffer( + &self, + key: &str, + put_id: PutIDForAKey, + entry_len: u64, + data: AlignedBuffer, + ) -> KvResult<()> { + let (done_tx, done_rx) = oneshot::channel(); + let write_tx = self.next_write_tx()?; + write_tx + .send(WriteCommand { + key: KvSsdKey { + key: key.to_string(), + put_id, + }, + entry_len, + data, + done_tx, + }) + .await + .map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd write queue closed: {}", err), + }) + })?; + done_rx.await.map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd write completion closed: {}", err), + }) + })? 
+ } + + pub async fn load_into_addr( + &self, + key: &str, + put_id: PutIDForAKey, + target_addr: u64, + len: u64, + target_len: u64, + ) -> KvResult<()> { + validate_key(key)?; + if target_len < len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd target capacity too small for key={} put_id=({},{}) len={} target_len={}", + key, put_id.0, put_id.1, len, target_len + ), + })); + } + let key = KvSsdKey { + key: key.to_string(), + put_id, + }; + let (entry, read_pin) = { + let mut inner = self.inner.lock(); + let Some(entry) = inner.ring.pin_read(&key) else { + return Err(KvError::Api(ApiError::KeyNotFound { + key: key.key.clone(), + })); + }; + ( + entry, + SsdReadPin { + inner: Arc::clone(&self.inner), + space_notify: Arc::clone(&self.space_notify), + key: key.clone(), + }, + ) + }; + if entry.len != len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd length mismatch for key={} put_id=({},{}) expected={} actual={}", + key.key, put_id.0, put_id.1, len, entry.len + ), + })); + } + + let len_usize = usize::try_from(len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd load len does not fit usize: {}", len), + }) + })?; + let aligned_len_usize = usize::try_from(entry.aligned_len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd aligned load len does not fit usize: {}", + entry.aligned_len + ), + }) + })?; + let target = match choose_read_path(&entry, target_addr, len, target_len) { + SsdReadPath::Direct => ReadTarget::Direct { + target_addr, + len: aligned_len_usize, + }, + SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(aligned_len_usize)?), + }; + let output = self + .submit_read_command( + key, + entry.clone(), + entry.file_offset, + target, + Some(read_pin), + ) + .await?; + if let ReadOutput::Scratch(buffer) = output { + unsafe { + std::ptr::copy_nonoverlapping(buffer.as_ptr(), target_addr as *mut u8, 
len_usize); + } + } + Ok(()) + } + + pub(crate) async fn load_into_addr_chunks( + &self, + key: &str, + put_id: PutIDForAKey, + target_addr: u64, + len: u64, + target_len: u64, + chunk_bytes: u64, + max_read_inflight: usize, + ready_tx: tokio_mpsc::Sender, + ) -> KvResult<()> { + validate_key(key)?; + if target_len < len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd target capacity too small for chunked load: key={} put_id=({},{}) len={} target_len={}", + key, put_id.0, put_id.1, len, target_len + ), + })); + } + let chunk_bytes = align_up_u64(chunk_bytes.max(1), SSD_ALIGNMENT as u64)?; + let key = KvSsdKey { + key: key.to_string(), + put_id, + }; + let (entry, _read_pin) = { + let mut inner = self.inner.lock(); + let Some(entry) = inner.ring.pin_read(&key) else { + return Err(KvError::Api(ApiError::KeyNotFound { + key: key.key.clone(), + })); + }; + ( + entry, + SsdReadPin { + inner: Arc::clone(&self.inner), + space_notify: Arc::clone(&self.space_notify), + key: key.clone(), + }, + ) + }; + if entry.len != len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd length mismatch for chunked load: key={} put_id=({},{}) expected={} actual={}", + key.key, put_id.0, put_id.1, len, entry.len + ), + })); + } + + let mut next_offset = 0u64; + let mut inflight = FuturesUnordered::new(); + let max_read_inflight = max_read_inflight.max(1); + + loop { + while next_offset < len && inflight.len() < max_read_inflight { + let payload_len = chunk_bytes.min(len - next_offset); + let stage_addr = checked_add_u64(target_addr, next_offset, "chunk stage addr")?; + let remaining_target_len = target_len - next_offset; + inflight.push(self.load_entry_range_into_addr( + key.clone(), + entry.clone(), + next_offset, + payload_len, + stage_addr, + remaining_target_len, + )); + next_offset += payload_len; + } + + let Some(chunk) = inflight.next().await else { + break; + }; + let chunk = chunk?; + 
ready_tx.send(chunk).await.map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd chunk ready queue closed: {}", err), + }) + })?; + } + Ok(()) + } + + async fn load_entry_range_into_addr( + &self, + key: KvSsdKey, + entry: SsdIndexEntry, + offset: u64, + payload_len: u64, + target_addr: u64, + target_len: u64, + ) -> KvResult { + if payload_len == 0 { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd chunk payload len must be positive".to_string(), + })); + } + let payload_end = checked_add_u64(offset, payload_len, "chunk payload end")?; + if payload_end > entry.len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd chunk exceeds entry len: offset={} len={} entry_len={}", + offset, payload_len, entry.len + ), + })); + } + let read_len = align_up_u64(payload_len, SSD_ALIGNMENT as u64)?; + let read_end = checked_add_u64(offset, read_len, "chunk read end")?; + if read_end > entry.aligned_len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd aligned chunk exceeds entry aligned len: offset={} read_len={} aligned_len={}", + offset, read_len, entry.aligned_len + ), + })); + } + if target_len < read_len { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd chunk target capacity too small: offset={} read_len={} target_len={}", + offset, read_len, target_len + ), + })); + } + let file_offset = checked_add_u64(entry.file_offset, offset, "chunk file offset")?; + let read_len_usize = usize::try_from(read_len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd chunk read len does not fit usize: {}", read_len), + }) + })?; + let payload_len_usize = usize::try_from(payload_len).map_err(|_| { + KvError::Api(ApiError::InvalidArgument { + detail: format!( + "kv ssd chunk payload len does not fit usize: {}", + payload_len + ), + }) + })?; + let target = match choose_chunk_read_path(target_addr, 
read_len, target_len, file_offset) { + SsdReadPath::Direct => ReadTarget::Direct { + target_addr, + len: read_len_usize, + }, + SsdReadPath::Scratch => ReadTarget::Scratch(AlignedBuffer::zeroed(read_len_usize)?), + }; + let output = self + .submit_read_command(key, entry, file_offset, target, None) + .await?; + if let ReadOutput::Scratch(buffer) = output { + unsafe { + std::ptr::copy_nonoverlapping( + buffer.as_ptr(), + target_addr as *mut u8, + payload_len_usize, + ); + } + } + Ok(SsdLoadedChunk { + offset, + stage_addr: target_addr, + len: payload_len, + }) + } + + async fn submit_read_command( + &self, + key: KvSsdKey, + entry: SsdIndexEntry, + file_offset: u64, + target: ReadTarget, + read_pin: Option, + ) -> KvResult { + let (done_tx, done_rx) = oneshot::channel(); + let read_tx = self.read_tx_for_shard(entry.shard_id)?; + read_tx + .send(ReadCommand { + key, + entry, + file_offset, + target, + _read_pin: read_pin, + done_tx, + }) + .await + .map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd read queue closed: {}", err), + }) + })?; + done_rx.await.map_err(|err| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd read completion closed: {}", err), + }) + })? 
+ } + + #[cfg(test)] + async fn has_entry(&self, key: &str, put_id: PutIDForAKey) -> bool { + let key = KvSsdKey { + key: key.to_string(), + put_id, + }; + self.inner.lock().ring.get(&key).is_some() + } +} + +async fn ssd_writer_loop( + inner: Arc>, + mut rx: tokio_mpsc::Receiver, + io: Arc, + space_notify: Arc, + write_inflight: usize, + shard_ids: Vec, +) { + let mut pending: VecDeque = VecDeque::new(); + let mut inflight = FuturesUnordered::new(); + let max_inflight = write_inflight.max(1); + + loop { + while inflight.len() < max_inflight { + let Some(cmd) = pending.pop_front() else { + break; + }; + let prepared = { + let mut inner = inner.lock(); + inner + .ring + .prepare_write_on_shards(cmd.key.clone(), cmd.entry_len, &shard_ids) + }; + match prepared { + Ok(SsdPreparedWrite::Ready(entry)) => { + inflight.push(execute_write( + WriteTask { + key: cmd.key, + entry, + data: cmd.data, + done_tx: cmd.done_tx, + }, + Arc::clone(&io), + )); + } + Ok(SsdPreparedWrite::Existing) => { + let _ = cmd.done_tx.send(Ok(())); + } + Ok(SsdPreparedWrite::BlockedByBusyIo) => { + pending.push_front(cmd); + break; + } + Err(err) => { + let _ = cmd.done_tx.send(Err(err)); + } + } + } + + tokio::select! { + Some(completion) = inflight.next(), if !inflight.is_empty() => { + finish_write_completion(&inner, &space_notify, completion); + } + Some(cmd) = rx.recv() => { + pending.push_back(cmd); + } + _ = space_notify.notified(), if !pending.is_empty() => { + // Retry pending commands after an active read/write releases a ring position. 
+ } + else => { + if pending.is_empty() && inflight.is_empty() { + break; + } + }, + } + } + + while !pending.is_empty() || !inflight.is_empty() { + while inflight.len() < max_inflight { + let Some(cmd) = pending.pop_front() else { + break; + }; + let prepared = { + let mut inner = inner.lock(); + inner + .ring + .prepare_write_on_shards(cmd.key.clone(), cmd.entry_len, &shard_ids) + }; + match prepared { + Ok(SsdPreparedWrite::Ready(entry)) => { + inflight.push(execute_write( + WriteTask { + key: cmd.key, + entry, + data: cmd.data, + done_tx: cmd.done_tx, + }, + Arc::clone(&io), + )); + } + Ok(SsdPreparedWrite::Existing) => { + let _ = cmd.done_tx.send(Ok(())); + } + Ok(SsdPreparedWrite::BlockedByBusyIo) => { + pending.push_front(cmd); + break; + } + Err(err) => { + let _ = cmd.done_tx.send(Err(err)); + } + } + } + + if let Some(completion) = inflight.next().await { + finish_write_completion(&inner, &space_notify, completion); + } else if !pending.is_empty() { + space_notify.notified().await; + } + } +} + +fn finish_write_completion( + inner: &Arc>, + space_notify: &Notify, + completion: WriteCompletion, +) { + let committed = inner + .lock() + .ring + .commit(&completion.key, completion.success); + space_notify.notify_one(); + let result = if completion.success && !committed { + Err(KvError::Api(ApiError::KeyNotFound { + key: completion.key.key.clone(), + })) + } else { + completion.result + }; + let _ = completion.done_tx.send(result); +} + +async fn execute_write(task: WriteTask, io: Arc) -> WriteCompletion { + let WriteTask { + key, + entry, + data, + done_tx, + } = task; + let data_len = data.len(); + let shard_id = entry.shard_id; + let file_offset = entry.file_offset; + let result = async move { + let rx = { + let data_ptr = data.as_ptr(); + io.writev_at_async(shard_id, vec![(data_ptr, data_len)], file_offset)? 
+ }; + let written = rx + .await + .map_err(|_| io::Error::other("kv ssd write completion dropped"))??; + if written != data_len { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + format!("short kv ssd write: {} != {}", written, data_len), + ) + .into()); + } + Ok(()) + } + .await; + let result = result.map_err(|err| file_error_for_entry(&key, file_offset, err)); + WriteCompletion { + key, + success: result.is_ok(), + result, + done_tx, + } +} + +async fn ssd_reader_loop( + inner: Arc>, + mut rx: tokio_mpsc::Receiver, + io: Arc, + read_inflight: usize, +) { + let mut pending = VecDeque::new(); + let mut inflight = FuturesUnordered::new(); + let max_inflight = read_inflight.max(1); + + loop { + while inflight.len() < max_inflight { + let Some(task) = pending.pop_front() else { + break; + }; + inflight.push(execute_read(task, Arc::clone(&io))); + } + + tokio::select! { + Some(completion) = inflight.next(), if !inflight.is_empty() => { + let valid = inner.lock().ring.is_offset_valid(&completion.entry); + let result = if valid { + completion.result + } else { + inner.lock().ring.remove(&completion.key); + Err(KvError::Api(ApiError::KeyNotFound { + key: completion.key.key.clone(), + })) + }; + let _ = completion.done_tx.send(result); + } + Some(cmd) = rx.recv() => { + pending.push_back(ReadTask { + key: cmd.key, + entry: cmd.entry, + file_offset: cmd.file_offset, + target: cmd.target, + _read_pin: cmd._read_pin, + done_tx: cmd.done_tx, + }); + } + else => break, + } + } + + while let Some(completion) = inflight.next().await { + let valid = inner.lock().ring.is_offset_valid(&completion.entry); + let result = if valid { + completion.result + } else { + inner.lock().ring.remove(&completion.key); + Err(KvError::Api(ApiError::KeyNotFound { + key: completion.key.key.clone(), + })) + }; + let _ = completion.done_tx.send(result); + } +} + +async fn execute_read(task: ReadTask, io: Arc) -> ReadCompletion { + let ReadTask { + key, + entry, + file_offset, + target, + 
_read_pin, + done_tx, + } = task; + let shard_id = entry.shard_id; + let result = async move { + match target { + ReadTarget::Scratch(mut buffer) => { + let buffer_len = buffer.len(); + let rx = { + let buffer_ptr = buffer.as_mut_ptr(); + io.readv_at_async(shard_id, vec![(buffer_ptr, buffer_len)], file_offset)? + }; + let read = rx + .await + .map_err(|_| io::Error::other("kv ssd read completion dropped"))??; + if read != buffer_len { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!("short kv ssd read: {} != {}", read, buffer_len), + )); + } + Ok(ReadOutput::Scratch(buffer)) + } + ReadTarget::Direct { target_addr, len } => { + let rx = + io.readv_at_async(shard_id, vec![(target_addr as *mut u8, len)], file_offset)?; + let read = rx + .await + .map_err(|_| io::Error::other("kv ssd read completion dropped"))??; + if read != len { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!("short kv ssd direct read: {} != {}", read, len), + )); + } + Ok(ReadOutput::Direct) + } + } + } + .await + .map_err(|err| file_error_for_entry(&key, file_offset, err)); + ReadCompletion { + key, + entry, + result, + _read_pin, + done_tx, + } +} + +#[derive(Clone, Copy)] +struct UringConfig { + threads: usize, + io_depth: usize, +} + +#[derive(Clone, Copy)] +enum IoType { + Readv, + Writev, +} + +struct IoCtx { + io_type: IoType, + fd: RawFd, + len: usize, + offset: u64, + complete: oneshot::Sender>, + iovecs: Box<[libc::iovec]>, +} + +unsafe impl Send for IoCtx {} + +struct UringShard { + read_rx: crossbeam::channel::Receiver, + write_rx: crossbeam::channel::Receiver, + uring: IoUring, + io_depth: usize, + read_weight: usize, +} + +impl UringShard { + fn run(mut self) { + let mut read_inflight = 0usize; + let mut write_inflight = 0usize; + let mut read_closed = false; + let mut write_closed = false; + + loop { + let mut inflight = read_inflight + write_inflight; + while inflight < self.io_depth && !(read_closed && write_closed) { + let next = 
self.try_recv_weighted( + &mut read_closed, + &mut write_closed, + read_inflight, + write_inflight, + ); + let Some(ctx) = next else { + break; + }; + self.submit_ctx(ctx, &mut read_inflight, &mut write_inflight); + inflight = read_inflight + write_inflight; + } + + if read_closed && write_closed && inflight == 0 { + return; + } + if inflight == 0 { + let Some(ctx) = self.recv_blocking(&mut read_closed, &mut write_closed) else { + continue; + }; + self.submit_ctx(ctx, &mut read_inflight, &mut write_inflight); + continue; + } + if let Err(err) = self.uring.submit_and_wait(1) { + while let Some(cqe) = self.uring.completion().next() { + let data = cqe.user_data(); + if data != 0 { + let ctx = unsafe { Box::from_raw(data as *mut IoCtx) }; + let _ = ctx.complete.send(Err(io::Error::other(format!( + "io_uring submit failed: {err}" + )))); + } + } + return; + } + + for cqe in self.uring.completion() { + let data = cqe.user_data(); + if data == 0 { + continue; + } + let ctx = unsafe { Box::from_raw(data as *mut IoCtx) }; + match ctx.io_type { + IoType::Readv => read_inflight = read_inflight.saturating_sub(1), + IoType::Writev => write_inflight = write_inflight.saturating_sub(1), + } + let res = cqe.result(); + let send_res = if res < 0 { + Err(io::Error::from_raw_os_error(-res)) + } else { + Ok(res as usize) + }; + let _ = ctx.complete.send(send_res); + } + } + } + + fn try_recv_weighted( + &self, + read_closed: &mut bool, + write_closed: &mut bool, + read_inflight: usize, + write_inflight: usize, + ) -> Option { + let prefer_read = read_inflight <= write_inflight.saturating_mul(self.read_weight); + if prefer_read { + self.try_recv_read(read_closed) + .or_else(|| self.try_recv_write(write_closed)) + } else { + self.try_recv_write(write_closed) + .or_else(|| self.try_recv_read(read_closed)) + } + } + + fn try_recv_read(&self, read_closed: &mut bool) -> Option { + if *read_closed { + return None; + } + match self.read_rx.try_recv() { + Ok(ctx) => Some(ctx), + 
Err(crossbeam::channel::TryRecvError::Empty) => None, + Err(crossbeam::channel::TryRecvError::Disconnected) => { + *read_closed = true; + None + } + } + } + + fn try_recv_write(&self, write_closed: &mut bool) -> Option { + if *write_closed { + return None; + } + match self.write_rx.try_recv() { + Ok(ctx) => Some(ctx), + Err(crossbeam::channel::TryRecvError::Empty) => None, + Err(crossbeam::channel::TryRecvError::Disconnected) => { + *write_closed = true; + None + } + } + } + + fn recv_blocking(&self, read_closed: &mut bool, write_closed: &mut bool) -> Option { + loop { + match (!*read_closed, !*write_closed) { + (true, true) => { + crossbeam::channel::select! { + recv(self.read_rx) -> msg => match msg { + Ok(ctx) => return Some(ctx), + Err(_) => *read_closed = true, + }, + recv(self.write_rx) -> msg => match msg { + Ok(ctx) => return Some(ctx), + Err(_) => *write_closed = true, + }, + } + } + (true, false) => match self.read_rx.recv() { + Ok(ctx) => return Some(ctx), + Err(_) => *read_closed = true, + }, + (false, true) => match self.write_rx.recv() { + Ok(ctx) => return Some(ctx), + Err(_) => *write_closed = true, + }, + (false, false) => return None, + } + } + } + + fn submit_ctx(&mut self, ctx: IoCtx, read_inflight: &mut usize, write_inflight: &mut usize) { + let fd = Fd(ctx.fd); + let iovecs_ptr = ctx.iovecs.as_ptr(); + let sqe = match ctx.io_type { + IoType::Readv => opcode::Readv::new(fd, iovecs_ptr, ctx.len as _) + .offset(ctx.offset) + .build(), + IoType::Writev => opcode::Writev::new(fd, iovecs_ptr, ctx.len as _) + .offset(ctx.offset) + .build(), + }; + let io_type = ctx.io_type; + let data = Box::into_raw(Box::new(ctx)) as u64; + let sqe = sqe.user_data(data); + let push_result = unsafe { self.uring.submission().push(&sqe) }; + if push_result.is_err() { + let ctx = unsafe { Box::from_raw(data as *mut IoCtx) }; + let _ = ctx + .complete + .send(Err(io::Error::other("submission queue full"))); + return; + } + match io_type { + IoType::Readv => 
*read_inflight += 1, + IoType::Writev => *write_inflight += 1, + } + } +} + +#[derive(Debug)] +struct UringIoEngine { + fds: HashMap, + read_txs: Vec>, + write_txs: Vec>, + handles: Vec>, +} + +impl UringIoEngine { + fn new_multi(shard_fds: Vec<(usize, RawFd)>, cfg: UringConfig) -> io::Result { + if cfg.threads == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "threads must be > 0", + )); + } + if shard_fds.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "at least one fd is required", + )); + } + let fds = shard_fds.into_iter().collect::>(); + let mut read_txs = Vec::with_capacity(cfg.threads); + let mut write_txs = Vec::with_capacity(cfg.threads); + let mut handles = Vec::with_capacity(cfg.threads); + for idx in 0..cfg.threads { + let (read_tx, read_rx) = crossbeam::channel::bounded(cfg.io_depth * 2); + let (write_tx, write_rx) = crossbeam::channel::bounded(cfg.io_depth * 2); + let uring = IoUring::builder().build(cfg.io_depth as u32)?; + let handle = std::thread::Builder::new() + .name(format!("fluxon-kv-ssd-uring-{idx}")) + .spawn(move || { + UringShard { + read_rx, + write_rx, + uring, + io_depth: cfg.io_depth, + read_weight: DEFAULT_URING_READ_WEIGHT, + } + .run() + })?; + read_txs.push(read_tx); + write_txs.push(write_tx); + handles.push(handle); + } + Ok(Self { + fds, + read_txs, + write_txs, + handles, + }) + } + + fn readv_at_async( + &self, + shard_id: usize, + iovecs: Vec<(*mut u8, usize)>, + offset: u64, + ) -> io::Result>> { + self.submit_iovecs(IoType::Readv, shard_id, iovecs, offset) + } + + fn writev_at_async( + &self, + shard_id: usize, + iovecs: Vec<(*const u8, usize)>, + offset: u64, + ) -> io::Result>> { + let iovecs = iovecs + .into_iter() + .map(|(ptr, len)| (ptr as *mut u8, len)) + .collect(); + self.submit_iovecs(IoType::Writev, shard_id, iovecs, offset) + } + + fn submit_iovecs( + &self, + io_type: IoType, + shard_id: usize, + iovecs: Vec<(*mut u8, usize)>, + offset: u64, + ) -> io::Result>> { + 
if iovecs.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "readv/writev requires at least one iovec", + )); + } + validate_direct_io( + iovecs.iter().map(|(ptr, len)| (*ptr as usize, *len)), + offset, + )?; + let iovecs_libc = iovecs + .iter() + .map(|(ptr, len)| libc::iovec { + iov_base: *ptr as *mut libc::c_void, + iov_len: *len, + }) + .collect::>() + .into_boxed_slice(); + let (tx, rx) = oneshot::channel(); + let ctx = IoCtx { + io_type, + fd: self.fd(shard_id)?, + len: iovecs_libc.len(), + offset, + complete: tx, + iovecs: iovecs_libc, + }; + self.pick_tx(io_type, shard_id).send(ctx).map_err(|err| { + io::Error::new( + io::ErrorKind::BrokenPipe, + format!("io_uring send failed: {}", err), + ) + })?; + Ok(rx) + } + + fn fd(&self, shard_id: usize) -> io::Result { + self.fds.get(&shard_id).copied().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid SSD shard id {shard_id}"), + ) + }) + } + + fn pick_tx(&self, io_type: IoType, shard_id: usize) -> &crossbeam::channel::Sender { + match io_type { + IoType::Readv => &self.read_txs[shard_id % self.read_txs.len()], + IoType::Writev => &self.write_txs[shard_id % self.write_txs.len()], + } + } +} + +impl Drop for UringIoEngine { + fn drop(&mut self) { + self.read_txs.clear(); + self.write_txs.clear(); + for handle in self.handles.drain(..) 
{ + let _ = handle.join(); + } + } +} + +struct AlignedBuffer { + ptr: NonNull, + len: usize, +} + +unsafe impl Send for AlignedBuffer {} + +impl AlignedBuffer { + fn zeroed(len: usize) -> KvResult { + if len == 0 || !len.is_multiple_of(SSD_ALIGNMENT) { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!( + "aligned buffer len must be positive and {}-byte aligned: {}", + SSD_ALIGNMENT, len + ), + })); + } + let mut raw = std::ptr::null_mut(); + let rc = unsafe { libc::posix_memalign(&mut raw, SSD_ALIGNMENT, len) }; + if rc != 0 || raw.is_null() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: format!("posix_memalign failed with rc={}", rc), + })); + } + unsafe { + std::ptr::write_bytes(raw as *mut u8, 0, len); + } + Ok(Self { + ptr: NonNull::new(raw as *mut u8).expect("posix_memalign returned non-null"), + len, + }) + } + + unsafe fn copy_from_addr(addr: u64, actual_len: usize, aligned_len: usize) -> KvResult { + let mut buffer = Self::zeroed(aligned_len)?; + unsafe { + std::ptr::copy_nonoverlapping(addr as *const u8, buffer.as_mut_ptr(), actual_len); + } + Ok(buffer) + } + + fn as_ptr(&self) -> *const u8 { + self.ptr.as_ptr() + } + + fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr.as_ptr() + } + + fn len(&self) -> usize { + self.len + } +} + +impl Drop for AlignedBuffer { + fn drop(&mut self) { + unsafe { + libc::free(self.ptr.as_ptr() as *mut libc::c_void); + } + } +} + +fn validate_key(key: &str) -> KvResult<()> { + if key.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage key must be non-empty".to_string(), + })); + } + Ok(()) +} + +fn choose_read_path( + entry: &SsdIndexEntry, + target_addr: u64, + len: u64, + target_len: u64, +) -> SsdReadPath { + if len == 0 || entry.len != len { + return SsdReadPath::Scratch; + } + if target_addr.is_multiple_of(SSD_ALIGNMENT as u64) + && target_len >= entry.aligned_len + && entry.file_offset.is_multiple_of(SSD_ALIGNMENT as u64) + { + 
SsdReadPath::Direct + } else { + SsdReadPath::Scratch + } +} + +fn choose_chunk_read_path( + target_addr: u64, + read_len: u64, + target_len: u64, + file_offset: u64, +) -> SsdReadPath { + if read_len != 0 + && target_addr.is_multiple_of(SSD_ALIGNMENT as u64) + && read_len.is_multiple_of(SSD_ALIGNMENT as u64) + && target_len >= read_len + && file_offset.is_multiple_of(SSD_ALIGNMENT as u64) + { + SsdReadPath::Direct + } else { + SsdReadPath::Scratch + } +} + +fn choose_shard_count(max_bytes: u64, root_count: usize) -> usize { + let max_aligned_shards = (max_bytes / SSD_ALIGNMENT as u64).max(1) as usize; + DEFAULT_SHARDS_PER_OWNER + .max(root_count) + .min(max_aligned_shards) + .max(1) +} + +fn aligned_shard_capacity(capacity_bytes: u64, shard_count: usize) -> KvResult { + let raw = capacity_bytes / shard_count as u64; + let capacity = raw / SSD_ALIGNMENT as u64 * SSD_ALIGNMENT as u64; + if capacity == 0 { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage capacity is too small for shard count".to_string(), + })); + } + Ok(capacity) +} + +fn deduplicate_device_roots(root_dirs: &[PathBuf]) -> KvResult> { + if root_dirs.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs must contain at least one path".to_string(), + })); + } + let mut seen_devices = HashSet::new(); + let mut device_roots = Vec::new(); + for root_dir in root_dirs { + fs::create_dir_all(root_dir).map_err(|err| file_error(root_dir, 0, err))?; + let metadata = fs::metadata(root_dir).map_err(|err| file_error(root_dir, 0, err))?; + let device_id = metadata.dev(); + if seen_devices.insert(device_id) { + device_roots.push(SsdDeviceRoot { + device_id, + root_dir: root_dir.clone(), + }); + } + } + if device_roots.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs contains no usable device".to_string(), + })); + } + Ok(device_roots) +} + +fn open_cache_files( + device_roots: 
&[SsdDeviceRoot], + shard_count: usize, + shard_capacity: u64, +) -> KvResult> { + if device_roots.is_empty() { + return Err(KvError::Api(ApiError::InvalidArgument { + detail: "kv ssd storage root_dirs must contain at least one path".to_string(), + })); + } + let mut files = Vec::with_capacity(shard_count); + for shard_id in 0..shard_count { + let device_idx = shard_id % device_roots.len(); + let root_dir = &device_roots[device_idx].root_dir; + let shards_dir = root_dir.join("shards"); + fs::create_dir_all(&shards_dir).map_err(|err| file_error(&shards_dir, 0, err))?; + let path = shards_dir.join(format!("shard-{shard_id:06}.dat")); + let file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .custom_flags(libc::O_DIRECT) + .open(&path) + .map_err(|err| file_error(&path, 0, err))?; + file.set_len(shard_capacity) + .map_err(|err| file_error(&path, 0, err))?; + files.push(OpenedSsdShard { + shard_id, + device_idx, + file, + }); + } + Ok(files) +} + +fn align_up_usize(value: usize, alignment: usize) -> KvResult { + value + .checked_add(alignment - 1) + .map(|v| v / alignment * alignment) + .ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("alignment overflow for value={}", value), + }) + }) +} + +fn align_up_u64(value: u64, alignment: u64) -> KvResult { + value + .checked_add(alignment - 1) + .map(|v| v / alignment * alignment) + .ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("alignment overflow for value={}", value), + }) + }) +} + +pub(crate) fn align_ssd_io_len(len: u64) -> KvResult { + align_up_u64(len, SSD_ALIGNMENT as u64) +} + +fn checked_add_u64(lhs: u64, rhs: u64, label: &str) -> KvResult { + lhs.checked_add(rhs).ok_or_else(|| { + KvError::Api(ApiError::InvalidArgument { + detail: format!("kv ssd {label} overflow: {lhs} + {rhs}"), + }) + }) +} + +fn validate_direct_io( + iovecs: impl IntoIterator, + offset: u64, +) -> io::Result<()> { + ensure_aligned("offset", 
offset as usize)?; + for (addr, len) in iovecs { + ensure_aligned("buffer address", addr)?; + ensure_aligned("iovec length", len)?; + } + Ok(()) +} + +fn ensure_aligned(name: &str, value: usize) -> io::Result<()> { + if value.is_multiple_of(SSD_ALIGNMENT) { + Ok(()) + } else { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("O_DIRECT {name} {value:#x} is not {SSD_ALIGNMENT}-byte aligned"), + )) + } +} + +fn file_error_for_entry(key: &KvSsdKey, offset: u64, err: io::Error) -> KvError { + KvError::Api(ApiError::FileWriteError { + path: format!("kv-ssd://{}@({},{})", key.key, key.put_id.0, key.put_id.1), + offset, + detail: err.to_string(), + }) +} + +fn file_error(path: &Path, offset: u64, err: io::Error) -> KvError { + KvError::Api(ApiError::FileWriteError { + path: path.to_string_lossy().to_string(), + offset, + detail: err.to_string(), + }) +} + +impl From for KvError { + fn from(err: io::Error) -> Self { + KvError::Api(ApiError::FileWriteError { + path: "kv-ssd://io".to_string(), + offset: 0, + detail: err.to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use uuid::Uuid; + + fn new_root() -> PathBuf { + std::env::current_dir() + .unwrap() + .join("target") + .join("fluxon_kv_ssd_tests") + .join(Uuid::new_v4().to_string()) + } + + async fn new_store(max_bytes: u64) -> KvSsdStorage { + KvSsdStorage::new(KvSsdStorageInit { + root_dirs: vec![new_root()], + max_bytes, + }) + .unwrap() + } + + fn test_key(key: &str, version: u64) -> KvSsdKey { + KvSsdKey { + key: key.to_string(), + put_id: (version, 0), + } + } + + fn prepare_ready(ring: &mut SsdRingBuffer, key: &KvSsdKey) -> SsdIndexEntry { + match ring.prepare_write(key.clone(), 500).unwrap() { + SsdPreparedWrite::Ready(entry) => entry, + other => panic!("expected ready SSD write, got {other:?}"), + } + } + + #[::tokio::test] + async fn persist_and_load_roundtrip() { + let store = new_store(1024 * 1024).await; + let data = b"hello from ssd"; + let put_id = (10, 1); + 
store.persist("k", put_id, data).await.unwrap(); + + let mut out = vec![0u8; data.len()]; + store + .load_into_addr( + "k", + put_id, + out.as_mut_ptr() as u64, + out.len() as u64, + out.len() as u64, + ) + .await + .unwrap(); + assert_eq!(out, data); + } + + #[::tokio::test] + async fn aligned_load_roundtrip_uses_direct_target() { + let store = new_store(1024 * 1024).await; + let data = (0..4096).map(|idx| (idx % 251) as u8).collect::>(); + let put_id = (11, 1); + store.persist("aligned", put_id, &data).await.unwrap(); + + let mut out = AlignedBuffer::zeroed(data.len()).unwrap(); + let target_addr = out.as_mut_ptr() as u64; + let entry = { + let key = KvSsdKey { + key: "aligned".to_string(), + put_id, + }; + store.inner.lock().ring.get(&key).unwrap() + }; + assert_eq!( + choose_read_path(&entry, target_addr, data.len() as u64, data.len() as u64), + SsdReadPath::Direct + ); + + store + .load_into_addr( + "aligned", + put_id, + target_addr, + data.len() as u64, + data.len() as u64, + ) + .await + .unwrap(); + + let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) }; + assert_eq!(out_slice, data.as_slice()); + } + + #[::tokio::test] + async fn chunked_load_roundtrip_streams_ready_chunks() { + let store = new_store(1024 * 1024).await; + let data = (0..2500).map(|idx| (idx % 251) as u8).collect::>(); + let put_id = (13, 1); + store.persist("chunked", put_id, &data).await.unwrap(); + + let mut out = + AlignedBuffer::zeroed(align_ssd_io_len(data.len() as u64).unwrap() as usize).unwrap(); + let target_addr = out.as_mut_ptr() as u64; + let (tx, mut rx) = ::tokio::sync::mpsc::channel(2); + let producer = store.load_into_addr_chunks( + "chunked", + put_id, + target_addr, + data.len() as u64, + out.len() as u64, + 1024, + 2, + tx, + ); + let consumer = async { + let mut chunks = Vec::new(); + while let Some(chunk) = rx.recv().await { + chunks.push((chunk.offset, chunk.len)); + } + chunks + }; + let (producer_res, mut chunks) = ::tokio::join!(producer, 
consumer); + producer_res.unwrap(); + chunks.sort_unstable(); + assert_eq!(chunks, vec![(0, 1024), (1024, 1024), (2048, 452)]); + + let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) }; + assert_eq!(out_slice, data.as_slice()); + } + + #[test] + fn read_path_uses_direct_for_aligned_target_with_enough_capacity() { + let aligned = SsdIndexEntry { + shard_id: 0, + begin: 0, + len: 4096, + aligned_len: 4096, + file_offset: 0, + }; + assert_eq!( + choose_read_path(&aligned, 4096, 4096, 4096), + SsdReadPath::Direct + ); + assert_eq!( + choose_read_path(&aligned, 4097, 4096, 4096), + SsdReadPath::Scratch + ); + + let unaligned_len = SsdIndexEntry { + len: 500, + aligned_len: 512, + ..aligned + }; + assert_eq!( + choose_read_path(&unaligned_len, 4096, 500, 512), + SsdReadPath::Direct + ); + assert_eq!( + choose_read_path(&unaligned_len, 4096, 500, 500), + SsdReadPath::Scratch + ); + } + + #[::tokio::test] + async fn unaligned_payload_loads_direct_when_stage_capacity_is_aligned() { + let store = new_store(1024 * 1024).await; + let data = (0..500).map(|idx| (idx % 251) as u8).collect::>(); + let put_id = (12, 1); + store.persist("unaligned", put_id, &data).await.unwrap(); + + let mut out = AlignedBuffer::zeroed(SSD_ALIGNMENT).unwrap(); + let target_addr = out.as_mut_ptr() as u64; + let entry = { + let key = KvSsdKey { + key: "unaligned".to_string(), + put_id, + }; + store.inner.lock().ring.get(&key).unwrap() + }; + assert_eq!(entry.len, data.len() as u64); + assert_eq!(entry.aligned_len, SSD_ALIGNMENT as u64); + assert_eq!( + choose_read_path(&entry, target_addr, data.len() as u64, SSD_ALIGNMENT as u64), + SsdReadPath::Direct + ); + + store + .load_into_addr( + "unaligned", + put_id, + target_addr, + data.len() as u64, + SSD_ALIGNMENT as u64, + ) + .await + .unwrap(); + + let out_slice = unsafe { std::slice::from_raw_parts(out.as_ptr(), data.len()) }; + assert_eq!(out_slice, data.as_slice()); + } + + #[::tokio::test] + async fn 
storage_deduplicates_root_dirs_on_same_device() { + let root_a = new_root(); + let root_b = new_root(); + let store = KvSsdStorage::new(KvSsdStorageInit { + root_dirs: vec![root_a.clone(), root_b.clone()], + max_bytes: 4 * SSD_ALIGNMENT as u64, + }) + .unwrap(); + + assert_eq!( + fs::metadata(&root_a).unwrap().dev(), + fs::metadata(&root_b).unwrap().dev() + ); + assert_eq!(store.root_dirs(), &[root_a.clone()]); + assert_eq!(store.devices.len(), 1); + assert_eq!(store.shard_to_device, vec![0, 0, 0, 0]); + assert!(root_a.join("shards/shard-000000.dat").exists()); + assert!(root_a.join("shards/shard-000001.dat").exists()); + assert!(root_a.join("shards/shard-000002.dat").exists()); + assert!(root_a.join("shards/shard-000003.dat").exists()); + assert!(!root_b.join("shards").exists()); + } + + #[test] + fn ring_prepare_write_on_shards_uses_only_allowed_shards() { + let mut ring = SsdRingBuffer::new(vec![1024, 1024, 1024, 1024]); + let mut allocated_shards = Vec::new(); + + for version in 0..4 { + let key = test_key("per-device", version); + let entry = match ring + .prepare_write_on_shards(key.clone(), 500, &[1, 3]) + .unwrap() + { + SsdPreparedWrite::Ready(entry) => entry, + other => panic!("expected ready SSD write, got {other:?}"), + }; + allocated_shards.push(entry.shard_id); + assert!(ring.commit(&key, true)); + } + + assert_eq!(allocated_shards, vec![1, 3, 1, 3]); + } + + #[::tokio::test] + async fn ring_keeps_new_entry_and_expires_old() { + let store = new_store(1024).await; + store.persist("old", (1, 0), &[1u8; 500]).await.unwrap(); + store.persist("filler", (2, 0), &[2u8; 500]).await.unwrap(); + store.persist("new", (3, 0), &[3u8; 500]).await.unwrap(); + + assert!(!store.has_entry("old", (1, 0)).await); + assert!(store.has_entry("filler", (2, 0)).await); + assert!(store.has_entry("new", (3, 0)).await); + } + + #[test] + fn ring_read_pin_blocks_overwrite_until_unpinned() { + let mut ring = SsdRingBuffer::new(vec![1024]); + let old = test_key("old", 1); + let 
filler = test_key("filler", 2); + let new = test_key("new", 3); + + let old_entry = prepare_ready(&mut ring, &old); + assert_eq!(old_entry.begin, 0); + assert!(ring.commit(&old, true)); + prepare_ready(&mut ring, &filler); + assert!(ring.commit(&filler, true)); + + let pinned = ring.pin_read(&old).unwrap(); + assert_eq!(pinned.begin, old_entry.begin); + assert!(matches!( + ring.prepare_write(new.clone(), 500).unwrap(), + SsdPreparedWrite::BlockedByBusyIo + )); + assert!(ring.get(&old).is_some()); + + ring.unpin_read(&old); + let new_entry = prepare_ready(&mut ring, &new); + assert_eq!(new_entry.file_offset, 0); + assert!(ring.commit(&new, true)); + assert!(ring.get(&old).is_none()); + } + + #[test] + fn ring_writing_entry_blocks_overwrite_until_write_finishes() { + let mut ring = SsdRingBuffer::new(vec![1024]); + let old = test_key("old", 1); + let filler = test_key("filler", 2); + let new = test_key("new", 3); + + let old_entry = prepare_ready(&mut ring, &old); + assert_eq!(old_entry.begin, 0); + prepare_ready(&mut ring, &filler); + + assert!(matches!( + ring.prepare_write(new.clone(), 500).unwrap(), + SsdPreparedWrite::BlockedByBusyIo + )); + + assert!(ring.commit(&old, true)); + let new_entry = prepare_ready(&mut ring, &new); + assert_eq!(new_entry.file_offset, 0); + } + + #[test] + fn safe_component_replaces_path_separators() { + assert_eq!(safe_path_component("owner/a:b"), "owner_a_b"); + } +} diff --git a/fluxon_rs/fluxon_kv/src/kv_test.rs b/fluxon_rs/fluxon_kv/src/kv_test.rs index 5f0a9e2..94d8ebe 100644 --- a/fluxon_rs/fluxon_kv/src/kv_test.rs +++ b/fluxon_rs/fluxon_kv/src/kv_test.rs @@ -11,9 +11,11 @@ use crate::cluster_manager::ClusterManagerRdmaControlInit; use crate::config::{ - ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, LargeFilePaths, MasterConfig, MonitoringConfig, - ProtocolConfig, ProtocolType, TestSpecConfig, TestSpecTransportMode, TransferEngineType, + ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, KvSsdStorageConfig, 
LargeFilePaths, + MasterConfig, MonitoringConfig, ProtocolConfig, ProtocolType, TestSpecConfig, + TestSpecTransportMode, TransferEngineType, }; +use crate::master_kv_router::msg_pack::GetSourceKind; use crate::run_master_with_test_overrides; use crate::{ClientRunTestOverrides, MasterRunTestOverrides, run_client_with_test_overrides}; // external client runs via run_client when contribution is zero @@ -38,6 +40,8 @@ const CLIENT_COMMUNICATION_VALUE: &[u8] = b"message_from_client1_to_client2"; const TRANSFER_DATA_PROBE_VALUE_LEN: usize = 256 * 1024; const KV_TEST_TRANSFER_PROBE_IO_TIMEOUT_SECS: u64 = 10; const KV_TEST_SHUTDOWN_TIMEOUT_SECS: u64 = 60; +const KV_TEST_SSD_STORAGE_BYTES: u64 = 64 * 1024 * 1024; +const KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS: u64 = 30; fn kv_test_run_scope() -> &'static str { static RUN_SCOPE: OnceLock = OnceLock::new(); @@ -610,6 +614,7 @@ struct KvTestClientOptions { enable_transfer_rpc_fast_path: Option, contribute_to_cluster_pool_size: Option, share_mem_path: Option, + ssd_storage: Option, etcd_mode: Option, } @@ -642,6 +647,10 @@ impl KvTestClientOptions { .share_mem_path .clone() .or_else(|| self.share_mem_path.clone()), + ssd_storage: overrides + .ssd_storage + .clone() + .or_else(|| self.ssd_storage.clone()), etcd_mode: overrides .etcd_mode .clone() @@ -650,6 +659,40 @@ impl KvTestClientOptions { } } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum KvTestStorageProfile { + Memory, + Ssd, + MemorySsd, +} + +impl KvTestStorageProfile { + fn round_suffix(self) -> &'static str { + match self { + Self::Memory => "", + Self::Ssd => "_ssd", + Self::MemorySsd => "_memory_ssd", + } + } + + fn ssd_storage(self) -> Option { + match self { + Self::Memory => None, + Self::Ssd | Self::MemorySsd => Some(KvSsdStorageConfig { + max_bytes: KV_TEST_SSD_STORAGE_BYTES, + }), + } + } + + fn requires_memory_source(self) -> bool { + matches!(self, Self::Memory | Self::MemorySsd) + } + + fn requires_ssd_source(self) -> bool { + matches!(self, 
Self::Ssd | Self::MemorySsd) + } +} + #[derive(Clone, Copy, Debug, PartialEq, Eq)] enum KvTestRoundProfile { P2pOnly, @@ -760,6 +803,7 @@ fn kv_test_round_test_spec_config(round_profile: KvTestRoundProfile) -> TestSpec #[derive(Clone, Debug)] struct KvTestRoundOptions { round_profile: KvTestRoundProfile, + storage_profile: KvTestStorageProfile, round_name: String, cluster_name: String, master_port: Option, @@ -803,6 +847,9 @@ impl KvTestRoundOptions { ) } + fn owner_sub_cluster(&self) -> String { + format!("{}_owners", self.round_name) + } } #[derive(Clone, Debug)] @@ -842,8 +889,7 @@ fn default_client_large_file_paths( instance_key: &str, contribute_to_cluster_pool_size: &ContributeToClusterPoolSize, ) -> LargeFilePaths { - if contribute_to_cluster_pool_size.dram == 0 - && contribute_to_cluster_pool_size.vram.is_empty() + if contribute_to_cluster_pool_size.dram == 0 && contribute_to_cluster_pool_size.vram.is_empty() { return LargeFilePaths { paths: Vec::new() }; } @@ -852,7 +898,10 @@ fn default_client_large_file_paths( } } -fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTestClientOptions { +fn default_owner_test_client_options( + round_profile: KvTestRoundProfile, + storage_profile: KvTestStorageProfile, +) -> KvTestClientOptions { KvTestClientOptions { protocol_config: Some(round_profile.protocol_config()), transfer_engine: Some(round_profile.owner_transfer_engine()), @@ -861,6 +910,7 @@ fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTes enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()), contribute_to_cluster_pool_size: Some(default_owner_contribute_to_cluster_pool_size()), share_mem_path: None, + ssd_storage: storage_profile.ssd_storage(), etcd_mode: Some(KvTestEtcdMode::Enabled), } } @@ -874,6 +924,7 @@ fn default_master_test_client_options(round_profile: KvTestRoundProfile) -> KvTe enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()), 
contribute_to_cluster_pool_size: None, share_mem_path: None, + ssd_storage: None, etcd_mode: None, } } @@ -887,22 +938,31 @@ fn default_external_test_client_options() -> KvTestClientOptions { enable_transfer_rpc_fast_path: Some(false), contribute_to_cluster_pool_size: Some(default_external_contribute_to_cluster_pool_size()), share_mem_path: None, + ssd_storage: None, etcd_mode: Some(KvTestEtcdMode::Disabled), } } -fn new_kv_test_round(round_profile: KvTestRoundProfile) -> KvTestRoundOptions { - let round_name = round_profile.round_name(); +fn new_kv_test_round( + round_profile: KvTestRoundProfile, + storage_profile: KvTestStorageProfile, +) -> KvTestRoundOptions { + let round_name = format!( + "{}{}", + round_profile.round_name(), + storage_profile.round_suffix() + ); KvTestRoundOptions { round_profile, - round_name: round_name.to_string(), + storage_profile, + round_name: round_name.clone(), // Keep each process run on its own cluster namespace so a crashed/aborted previous run // cannot poison the next rerun with stale members. 
cluster_name: format!("test_cluster_{}_{}", round_name, kv_test_run_scope()), master_port: None, step8_master_port: None, master_options: default_master_test_client_options(round_profile), - owner_client_options: default_owner_test_client_options(round_profile), + owner_client_options: default_owner_test_client_options(round_profile, storage_profile), external_client_options: default_external_test_client_options(), } } @@ -919,15 +979,35 @@ fn default_kv_test_run_options() -> KvTestRunOptions { .filter(|item| !item.is_empty()) { let profile = match round_name { - "p2p_only" => KvTestRoundProfile::P2pOnly, + "p2p_only" => { + rounds.push(new_kv_test_round( + KvTestRoundProfile::P2pOnly, + KvTestStorageProfile::Memory, + )); + continue; + } + "p2p_only_ssd" => { + rounds.push(new_kv_test_round( + KvTestRoundProfile::P2pOnly, + KvTestStorageProfile::Ssd, + )); + continue; + } + "p2p_only_memory_ssd" => { + rounds.push(new_kv_test_round( + KvTestRoundProfile::P2pOnly, + KvTestStorageProfile::MemorySsd, + )); + continue; + } "rdma_transfer_only" => KvTestRoundProfile::RdmaTransferOnly, "rdma_transfer_with_rpc" => KvTestRoundProfile::RdmaTransferWithRpc, other => panic!( - "unsupported FLUXON_KV_TEST_ROUNDS entry '{}'; expected one of: p2p_only, rdma_transfer_only, rdma_transfer_with_rpc", + "unsupported FLUXON_KV_TEST_ROUNDS entry '{}'; expected one of: p2p_only, p2p_only_ssd, p2p_only_memory_ssd, rdma_transfer_only, rdma_transfer_with_rpc", other ), }; - rounds.push(new_kv_test_round(profile)); + rounds.push(new_kv_test_round(profile, KvTestStorageProfile::Memory)); } if rounds.is_empty() { panic!("FLUXON_KV_TEST_ROUNDS was set but produced no valid rounds"); @@ -937,9 +1017,17 @@ fn default_kv_test_run_options() -> KvTestRunOptions { KvTestRunOptions { rounds: vec![ - new_kv_test_round(KvTestRoundProfile::P2pOnly), - new_kv_test_round(KvTestRoundProfile::RdmaTransferOnly), - new_kv_test_round(KvTestRoundProfile::RdmaTransferWithRpc), + 
new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::Memory), + new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::Ssd), + new_kv_test_round(KvTestRoundProfile::P2pOnly, KvTestStorageProfile::MemorySsd), + new_kv_test_round( + KvTestRoundProfile::RdmaTransferOnly, + KvTestStorageProfile::Memory, + ), + new_kv_test_round( + KvTestRoundProfile::RdmaTransferWithRpc, + KvTestStorageProfile::Memory, + ), ], } } @@ -1022,6 +1110,8 @@ fn build_client_launch( let contribute_to_cluster_pool_size = options .contribute_to_cluster_pool_size .unwrap_or(default_owner_contribute_to_cluster_pool_size()); + let is_external = contribute_to_cluster_pool_size.dram == 0 + && contribute_to_cluster_pool_size.vram.is_empty(); let share_mem_path = options .share_mem_path .unwrap_or_else(|| format!("/tmp/kvcache_shared_memory/{}", instance_key)); @@ -1043,7 +1133,11 @@ fn build_client_launch( enable_transfer_rpc_fast_path: options .enable_transfer_rpc_fast_path .expect("kv_test requires enable_transfer_rpc_fast_path to be set explicitly"), - sub_cluster: None, + sub_cluster: if is_external { + None + } else { + Some(round.owner_sub_cluster()) + }, }, // English note: // kv_test uses a per-instance shared memory path by default so each owner/external share @@ -1054,6 +1148,7 @@ fn build_client_launch( &instance_key, &contribute_to_cluster_pool_size, ), + ssd_storage: options.ssd_storage, // Mirror round intent into the generated config so logs and runtime behavior // agree on whether this launch is transfer_only vs transfer_with_rpc. 
test_spec_config: kv_test_round_test_spec_config(round.round_profile), @@ -1381,7 +1476,10 @@ async fn key_meta_cache_check( } } - tracing::info!("🔍 Starting PUT and GET in parallel: {}", parallel_unique_key); + tracing::info!( + "🔍 Starting PUT and GET in parallel: {}", + parallel_unique_key + ); for i in 0..10 { let (put_client, other_client) = if i % 2 == 0 { (client, client2) @@ -1420,7 +1518,9 @@ async fn key_meta_cache_check( } assert!( - put_client.client_kv_api().has_cached_key(parallel_unique_key), + put_client + .client_kv_api() + .has_cached_key(parallel_unique_key), "put client should have immediate local cache metadata for key {} after put time {}", parallel_unique_key, i @@ -1577,6 +1677,208 @@ async fn shutdown_framework_with_timeout(label: &str, framework: &crate::Framewo } } +fn build_storage_profile_probe_value(tag: &str) -> Vec { + const STORAGE_PROFILE_PROBE_VALUE_LEN: usize = 64 * 1024; + build_storage_profile_probe_value_with_len(tag, STORAGE_PROFILE_PROBE_VALUE_LEN) +} + +fn build_storage_profile_probe_value_with_len(tag: &str, len: usize) -> Vec { + let pattern = format!("kv_test_storage_profile:{tag}:").into_bytes(); + let mut value = Vec::with_capacity(len); + while value.len() < len { + value.extend_from_slice(pattern.as_slice()); + } + value.truncate(len); + value +} + +async fn force_evict_memory_replicas_for_storage_probe( + master_framework: &crate::Framework, + key: &str, +) { + let master_view = master_framework.master_kv_router_view(); + let deadline = + Instant::now() + Duration::from_secs(KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS); + let (put_id, memory_replica_nodes) = loop { + if let Some(route) = master_view.master_kv_router().inner().kv_routes.get(key) { + let put_id = route.put_id; + let memory_replica_nodes = route + .nodes_replicas + .read() + .keys() + .cloned() + .collect::>(); + let ssd_replica_count = route.ssd_replicas.read().len(); + if ssd_replica_count > 0 { + break (put_id, memory_replica_nodes); + } + } + + 
if Instant::now() >= deadline { + panic!( + "storage profile probe expected at least one SSD replica before memory eviction: key={} timeout={}s", + key, KV_TEST_STORAGE_PROFILE_SSD_ROUTE_TIMEOUT_SECS + ); + } + sleep(Duration::from_millis(50)).await; + }; + + for node_id in memory_replica_nodes { + crate::master_kv_router::delete::evict_one_kv_replica_for_node( + &master_view, + key.to_string(), + node_id.clone(), + put_id, + ) + .unwrap_or_else(|code| { + panic!( + "storage profile probe failed to evict memory replica: key={} node={} put_id=({},{}) code={}", + key, node_id, put_id.0, put_id.1, code + ) + }); + } + + let Some(route) = master_view.master_kv_router().inner().kv_routes.get(key) else { + panic!("storage profile probe route disappeared after memory replicas eviction: key={key}"); + }; + assert!( + route.nodes_replicas.read().is_empty(), + "storage profile probe memory replicas still exist after eviction: key={}", + key + ); + assert!( + !route.ssd_replicas.read().is_empty(), + "storage profile probe SSD replica disappeared after memory replicas eviction: key={}", + key + ); +} + +async fn assert_owner_get_source_kind( + reader_framework: &crate::Framework, + key: &str, + expected_value: &[u8], + expected_source_kind: GetSourceKind, +) { + let reader_view = reader_framework.client_kv_api_view().clone(); + let reader_api = reader_view.client_kv_api(); + let (mem_holder, get_info) = reader_api + .inner() + .get(key) + .await + .unwrap_or_else(|err| { + panic!( + "storage profile probe get failed: key={} expected_source={:?} err={}", + key, expected_source_kind, err + ) + }) + .unwrap_or_else(|| { + panic!( + "storage profile probe get returned None: key={} expected_source={:?}", + key, expected_source_kind + ) + }); + assert_eq!( + mem_holder.bytes(), + expected_value, + "storage profile probe value mismatch for key={key}" + ); + let Some(get_info) = get_info else { + panic!( + "storage profile probe expected remote get info for key={} source={:?}", + key, 
expected_source_kind + ); + }; + assert_eq!( + get_info.source_kind(), + expected_source_kind, + "storage profile probe source kind mismatch for key={key}" + ); +} + +async fn run_non_rdma_storage_profile_coverage( + round: &KvTestRoundOptions, + master_framework: &crate::Framework, + writer_framework: &crate::Framework, +) -> Option> { + if round.round_profile != KvTestRoundProfile::P2pOnly { + return None; + } + + info!( + "📋 Storage profile coverage: round={} storage={:?}", + round.round_name, round.storage_profile + ); + + let writer_view = writer_framework.client_kv_api_view().clone(); + let writer_api = writer_view.client_kv_api(); + let storage_probe_put_opts = || { + crate::client_kv_api::PutOptionalArgs(vec![ + crate::client_kv_api::PutOptionalArg::PreferredSubCluster(round.owner_sub_cluster()), + ]) + }; + + let memory_key = format!("storage_profile_memory_key_{}", round.round_name); + let memory_value = build_storage_profile_probe_value(&format!("{}:memory", round.round_name)); + if round.storage_profile.requires_memory_source() { + writer_api + .inner() + .put(&memory_key, &memory_value, storage_probe_put_opts()) + .await + .unwrap_or_else(|err| { + panic!( + "storage profile memory probe put failed: key={} err={}", + memory_key, err + ) + }); + } + + let ssd_key = format!("storage_profile_ssd_key_{}", round.round_name); + let ssd_value = build_storage_profile_probe_value_with_len( + &format!("{}:ssd", round.round_name), + 64 * 1024 + 123, + ); + if round.storage_profile.requires_ssd_source() { + writer_api + .inner() + .put(&ssd_key, &ssd_value, storage_probe_put_opts()) + .await + .unwrap_or_else(|err| { + panic!( + "storage profile SSD probe put failed: key={} err={}", + ssd_key, err + ) + }); + force_evict_memory_replicas_for_storage_probe(master_framework, &ssd_key).await; + } + + let reader_launch = new_client_launch(round, "test_storage_profile_reader", None); + let (reader_framework, _) = run_kv_test_client(reader_launch) + .await + 
.expect("Failed to start storage profile reader"); + + sleep(Duration::from_secs(10)).await; + + if round.storage_profile.requires_memory_source() { + assert_owner_get_source_kind( + &reader_framework, + &memory_key, + &memory_value, + GetSourceKind::Memory, + ) + .await; + } + if round.storage_profile.requires_ssd_source() { + assert_owner_get_source_kind(&reader_framework, &ssd_key, &ssd_value, GetSourceKind::Ssd) + .await; + } + + info!( + "✅ Storage profile coverage passed: round={} storage={:?}", + round.round_name, round.storage_profile + ); + Some(reader_framework) +} + async fn run_kv_step8(round: &KvTestRoundOptions) { info!("📋 Step 8: Verifying external client blocking and recovery behavior"); @@ -2720,6 +3022,9 @@ async fn run_kv_round(round: &KvTestRoundOptions) { info!("✅ Key meta cache testing completed"); } + let storage_profile_reader_framework = + run_non_rdma_storage_profile_coverage(round, &master_framework, &client1_framework).await; + // 清理旧资源 { info!("🧹 Cleaning up resources"); @@ -2743,6 +3048,14 @@ async fn run_kv_round(round: &KvTestRoundOptions) { .unwrap_or_else(|e| panic!("Client 1 framework shutdown failed: {}", e)); info!("✅ Client 1 framework shutdown successfully"); + if let Some(storage_profile_reader_framework) = storage_profile_reader_framework { + shutdown_framework_with_timeout( + "storage profile reader", + &storage_profile_reader_framework, + ) + .await; + } + master_framework .shutdown() .await diff --git a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs index c74b64a..43d3c09 100644 --- a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs +++ b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs @@ -148,6 +148,7 @@ fn new_client_config_with_cluster_and_dram( large_file_paths: crate::config::LargeFilePaths { paths: vec![format!("{}/large/{}", base, instance_key)], }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), }; println!("fluxonkv core created client config for test: {:?}", conf); 
diff --git a/fluxon_rs/fluxon_kv/src/lib.rs b/fluxon_rs/fluxon_kv/src/lib.rs index edaa386..3b1116d 100644 --- a/fluxon_rs/fluxon_kv/src/lib.rs +++ b/fluxon_rs/fluxon_kv/src/lib.rs @@ -7,6 +7,7 @@ pub mod external_client_api; pub mod panel_proxy; // #[cfg(test)] pub mod key_prefix; +pub mod kv_ssd_storage; #[cfg(feature = "test_bins")] pub mod kv_test; pub mod kvlease; @@ -797,6 +798,7 @@ fn build_side_transfer_worker_config( }, share_mem_path: owner_config.share_mem_path.clone(), large_file_paths: owner_config.large_file_paths.clone(), + ssd_storage: None, test_spec_config, }) } @@ -841,6 +843,7 @@ fn build_side_transfer_worker_config_yaml( cluster_name: side_config.cluster_name, share_mem_path: side_config.share_mem_path, large_file_paths: None, + ssd_storage: None, p2p_listen_port: side_config.fluxonkv_spec.p2p_listen_port, redis_compat: None, sub_cluster: None, @@ -1915,6 +1918,9 @@ async fn run_client_impl( if is_side_transfer_worker { metadata.insert("side_transfer_worker".to_string(), "true".to_string()); } + if !is_external && !is_side_transfer_worker && config.ssd_storage.is_some() { + metadata.insert("kv_ssd_storage".to_string(), "true".to_string()); + } // Local IPC routing requires both share-group owner id and the local IPC root. 
// The owner id is also published via a dedicated share-group key; we denormalize it into @@ -2004,6 +2010,20 @@ async fn run_client_impl( .await .map_err(|e| anyhow::anyhow!("Failed to initialize framework: {:#}", e))?; } else { + let ssd_storage = if is_side_transfer_worker { + None + } else if let Some(ssd_cfg) = config.ssd_storage.as_ref() { + let root_dirs = config + .large_file_paths + .kv_ssd_storage_dirs(&config.cluster_name, &config.instance_key) + .map_err(|err| anyhow::anyhow!("invalid kv ssd storage dirs: {}", err))?; + Some(crate::kv_ssd_storage::KvSsdStorageInit { + root_dirs, + max_bytes: ssd_cfg.max_bytes, + }) + } else { + None + }; let init_args = InitArgsOwner { cluster_manager_arg: ClusterManagerNewArg { etcd_endpoints: config.fluxonkv_spec.etcd_addresses.clone(), @@ -2036,6 +2056,7 @@ async fn run_client_impl( }, client_kv_api_arg: ClientKvApiNewArg { test_spec_config: config.test_spec_config.clone(), + ssd_storage, }, client_seg_pool_arg: ClientSegPoolNewArg { contribute_size: config.contribute_to_cluster_pool_size.clone(), @@ -2468,6 +2489,7 @@ mod tests { large_file_paths: crate::config::LargeFilePaths { paths: vec!["/tmp/fluxon_side_transfer_test_large".to_string()], }, + ssd_storage: None, test_spec_config: TestSpecConfig { enable_side_transfer: true, side_transfer_worker_count: 4, @@ -2736,8 +2758,8 @@ mod tests { large_file_paths: crate::config::LargeFilePaths { paths: vec![owner_large_root.to_string_lossy().into_owned()], }, - protocol_version: - fluxon_util::git_version_build_record::get_current_git_commitid().unwrap(), + protocol_version: fluxon_util::git_version_build_record::get_current_git_commitid() + .unwrap(), write_ts: Some(chrono::Utc::now().timestamp_micros()), }; let shared_meta_json = serde_json::to_string(&shared_meta).unwrap(); @@ -2773,6 +2795,7 @@ mod tests { }, share_mem_path: share_mem_root.to_string_lossy().into_owned(), large_file_paths: crate::config::LargeFilePaths { paths: Vec::new() }, + ssd_storage: None, 
test_spec_config: TestSpecConfig::default(), }; diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs index 12a55ee..52ac76e 100755 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/delete.rs @@ -130,7 +130,7 @@ pub fn evict_one_kv_replica_for_node( return Ok(()); } - let last_replica_gone = route.nodes_replicas.read().is_empty(); + let last_replica_gone = !route.has_live_replica(); if last_replica_gone { let removed = view .master_kv_router() diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs index 8c17155..346df40 100755 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/get.rs @@ -2,9 +2,10 @@ use super::{ InflightGetInfo, KvRouteInfo, MasterKvRouterView, NodeValueReplicaDesc, OwnerHoldingGetInfo, msg_pack::{ GetAllocationMode, GetDoneReq, GetDoneResp, GetMetaReq, GetMetaResp, GetRevokeReq, - GetRevokeResp, GetStartReq, GetStartResp, + GetRevokeResp, GetSourceKind, GetStartReq, GetStartResp, }, }; +use crate::kv_ssd_storage::{SSD_ALIGNMENT, align_ssd_io_len}; use crate::master_kv_router::OneKvNodesRoutes; use crate::master_kv_router::put::PutIDForAKey; use crate::memholder::MemholderManagerTrait; @@ -82,7 +83,7 @@ pub async fn handle_get_start( let mut remove_in_kv_routes = false; if let Some(one_kv_nodes_routes) = view.master_kv_router().inner().kv_routes.get(key) { one_kv_nodes_routes.clean_up_tomb_nodes_replicas(put_id, tombs, view); - if one_kv_nodes_routes.nodes_replicas.read().is_empty() { + if !one_kv_nodes_routes.has_live_replica() { remove_in_kv_routes = true; } } @@ -113,6 +114,67 @@ pub async fn handle_get_start( }, ) } + fn allocate_get_buffer_on_node( + view: &MasterKvRouterView, + node_id: &NodeID, + len: u64, + get_id: u64, + purpose: &str, + ) -> Result, msg_and_error::KvError> { + let node_allocators = 
view.master_seg_manager().get_node_allocators(node_id); + if node_allocators.is_empty() { + tracing::info!( + "No allocators found for {} during get: {}, node is not ready", + purpose, + node_id + ); + return Err(msg_and_error::KvError::Unreachable( + msg_and_error::UnreachableError::OwnerNoSeg { detail: "config=0 initializes as external; non-zero initializes as owner; the owner must have memory space (segment)".to_string() } + )); + } + + let allocator = node_allocators.choose(&mut rand::thread_rng()).unwrap(); + let mut allocated_addr: Option = None; + for attempt in 1..=3 { + if let Ok(allocation) = allocator.allocate(len) { + allocated_addr = Some(allocation); + break; + } else { + tracing::info!( + "{} allocation attempt {}/3 failed for get_id {} on node {}", + purpose, + attempt, + get_id, + node_id + ); + } + } + if let Some(allocation) = allocated_addr { + return Ok(Arc::new(allocation)); + } + + let total = allocator.total_size_bytes(); + let used = allocator.used_size_bytes(); + let free = total.saturating_sub(used); + Err(msg_and_error::KvError::Api( + msg_and_error::ApiError::NoSpace { + node: node_id.as_ref().to_string(), + segment: allocator.seg_device_id.clone(), + total_capacity: total, + free_capacity: free, + }, + )) + } + fn align_ssd_stage_addr(raw_addr: u64) -> Result { + raw_addr + .checked_add(SSD_ALIGNMENT as u64 - 1) + .map(|addr| addr / SSD_ALIGNMENT as u64 * SSD_ALIGNMENT as u64) + .ok_or_else(|| { + msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument { + detail: format!("ssd source staging address alignment overflow: {raw_addr}"), + }) + }) + } tracing::debug!("Handling GetStartReq: {:?}", req.serialize_part); @@ -253,11 +315,13 @@ pub async fn handle_get_start( put_id: one_kv_nodes_routes.put_id, get_id, node_id: resp_node_id.clone().into(), + source_kind: GetSourceKind::Memory, src_addr: resp_src_addr, target_addr: resp_target_addr, src_base_addr: resp_src_base, target_base_addr: resp_target_base, len: 
src_allocation.size(), + ssd_stage_len: 0, error_code: msg_and_error::OK, error_json: String::new(), server_process_us: 0, @@ -270,8 +334,10 @@ pub async fn handle_get_start( req_node_id, len: src_allocation.size(), allocation: target_allocation, // 存储target allocation + source_allocation: None, route: one_kv_nodes_routes.clone(), allocation_mode, + source_kind: GetSourceKind::Memory, }; view.master_kv_router() @@ -308,6 +374,167 @@ pub async fn handle_get_start( }, ); } + + let ssd_replicas = one_kv_nodes_routes.ssd_replicas.read().clone(); + let mut ssd_replica_keys = ssd_replicas.keys().collect::>(); + while !ssd_replica_keys.is_empty() { + let to_remove_idx = rand::thread_rng().gen_range(0..ssd_replica_keys.len()); + let selected_ssd_key = ssd_replica_keys.remove(to_remove_idx); + let ssd_replica = ssd_replicas + .get(&*selected_ssd_key) + .expect("selected SSD replica key must exist"); + if ssd_replica.tomb_tag.is_tomb() { + tombs.insert(selected_ssd_key.to_owned()); + } else { + let ssd_stage_len = match align_ssd_io_len(ssd_replica.len) { + Ok(len) => len, + Err(err) => { + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_alloc_len = match ssd_stage_len.checked_add(SSD_ALIGNMENT as u64 - 1) { + Some(len) => len, + None => { + let err = + msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument { + detail: format!( + "ssd source staging allocation length overflow: {ssd_stage_len}" + ), + }); + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_allocation = match allocate_get_buffer_on_node( + &view, + &ssd_replica.node_id, + source_alloc_len, + get_id, + "ssd source staging", + ) { + Ok(allocation) => allocation, + Err(err) => { + tracing::info!( + "Skipping SSD source for get_id {} on node {}: {}", + get_id, + ssd_replica.node_id, + err + ); + continue; + } + }; + let 
target_allocation = match allocate_get_buffer_on_node( + &view, + &req_node_id, + ssd_replica.len, + get_id, + "requesting target", + ) { + Ok(allocation) => allocation, + Err(err) => { + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let allocation_mode = if one_kv_nodes_routes.try_reserve_get_durable_slot() { + GetAllocationMode::DurableReplica + } else { + GetAllocationMode::Temporary + }; + let source_base = source_allocation.base_addr(); + let source_raw_addr = match source_base.checked_add(source_allocation.addr()) { + Some(addr) => addr, + None => { + let err = + msg_and_error::KvError::Api(msg_and_error::ApiError::InvalidArgument { + detail: format!( + "ssd source staging raw address overflow: base={} offset={}", + source_base, + source_allocation.addr() + ), + }); + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let source_addr = match align_ssd_stage_addr(source_raw_addr) { + Ok(addr) => addr, + Err(err) => { + return failed_resp_err( + err, + Some((tombs, one_kv_nodes_routes.put_id)), + &view, + &req.serialize_part.key, + ); + } + }; + let target_base = target_allocation.base_addr(); + let target_addr = target_base + target_allocation.addr(); + let resp = GetStartResp { + put_id: one_kv_nodes_routes.put_id, + get_id, + node_id: ssd_replica.node_id.clone().into(), + source_kind: GetSourceKind::Ssd, + src_addr: source_addr, + target_addr, + src_base_addr: source_base, + target_base_addr: target_base, + len: ssd_replica.len, + ssd_stage_len, + error_code: msg_and_error::OK, + error_json: String::new(), + server_process_us: 0, + }; + let info = InflightGetInfo { + put_id: one_kv_nodes_routes.put_id, + src_node_id: ssd_replica.node_id.clone(), + key: req.serialize_part.key.clone(), + req_node_id, + len: ssd_replica.len, + allocation: target_allocation, + source_allocation: Some(source_allocation), + 
route: one_kv_nodes_routes.clone(), + allocation_mode, + source_kind: GetSourceKind::Ssd, + }; + + view.master_kv_router() + .inner() + .inflight_gets + .insert(get_id, info) + .await; + + clean_up_tombs( + &view, + Some((tombs, one_kv_nodes_routes.put_id)), + &req.serialize_part.key, + ); + return ( + get_id, + MsgPack { + serialize_part: resp, + raw_bytes: Vec::new(), + }, + ); + } + } tracing::info!("Key not found: {}", req.serialize_part.key); { let err = msg_and_error::KvError::Api(msg_and_error::ApiError::KeyNotFound { @@ -322,6 +549,64 @@ pub async fn handle_get_start( } } +fn drop_failed_ssd_source(view: &MasterKvRouterView, inflight_info: &InflightGetInfo) { + if inflight_info.source_kind != GetSourceKind::Ssd { + tracing::warn!( + "Ignoring drop_ssd_source for non-SSD get: get_key={} put_id=({},{}) source_kind={:?}", + inflight_info.key, + inflight_info.put_id.0, + inflight_info.put_id.1, + inflight_info.source_kind + ); + return; + } + + let route = inflight_info.route.clone(); + if route.put_id != inflight_info.put_id { + return; + } + + let removed = route + .ssd_replicas + .write() + .remove(&inflight_info.src_node_id) + .is_some(); + if !removed { + return; + } + + tracing::warn!( + "Removed failed SSD replica: key={} node={} put_id=({},{})", + inflight_info.key, + inflight_info.src_node_id, + inflight_info.put_id.0, + inflight_info.put_id.1 + ); + + if route.has_live_replica() { + return; + } + + let route_for_compare = route.clone(); + let removed_route = view + .master_kv_router() + .inner() + .kv_routes + .remove_if(&inflight_info.key, |_, current| { + Arc::ptr_eq(current, &route_for_compare) && current.put_id == inflight_info.put_id + }) + .is_some(); + if removed_route && view.master_kv_router().prefix_index_enabled() { + let view_task = view.clone(); + let key_for_prefix = inflight_info.key.clone(); + let _ = view.spawn("ssd_failure_remove_prefix_index", async move { + let inner = view_task.master_kv_router().inner(); + let mut tree = 
inner.prefix_index.write().await; + tree.remove(&key_for_prefix); + }); + } +} + pub async fn handle_get_revoke( view: MasterKvRouterView, req: MsgPack, @@ -338,6 +623,9 @@ pub async fn handle_get_revoke( .remove(&get_id) .await { + if req.serialize_part.drop_ssd_source { + drop_failed_ssd_source(&view, &inflight_info); + } inflight_info.release_durable_slot_if_needed(); tracing::info!("Revoked get operation with get_id: {}", get_id); } else { @@ -381,7 +669,6 @@ pub async fn handle_get_done( .next_holder_id .fetch_add(1, Ordering::Relaxed); - let src_node_id = inflight_info.src_node_id; let key = inflight_info.key; // Create holding info @@ -404,7 +691,7 @@ pub async fn handle_get_done( if one_kv_nodes_routes.put_id == inflight_info.put_id { let mut nodes_replicas = one_kv_nodes_routes.nodes_replicas.write(); if let Some(tomb_tag) = - view.master_seg_manager().get_node_tomb_tag(&src_node_id) + view.master_seg_manager().get_node_tomb_tag(&req_node_id) { if !tomb_tag.is_tomb() { nodes_replicas.insert( @@ -632,6 +919,21 @@ pub async fn handle_get_meta( raw_bytes: Vec::new(), }; } + let ssd_replicas = (*one_kv_nodes_routes.ssd_replicas.read()).clone(); + for (_, kv_info) in ssd_replicas.iter() { + if kv_info.tomb_tag.is_tomb() { + continue; + } + return MsgPack { + serialize_part: GetMetaResp { + exists: true, + len: kv_info.len, + error_code: msg_and_error::OK, + error_json: String::new(), + }, + raw_bytes: Vec::new(), + }; + } // if let Some((_, kv_info)) = replicas.iter().next() { // let len = kv_info.allocation.size(); diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs index ee4ca2b..afbfc41 100644 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/mod.rs @@ -15,13 +15,14 @@ use self::{ msg_pack::{ BatchDeleteAckReq, BatchDeleteClientKvMetaCacheReq, CountPrefixReq, CountPrefixResp, DeleteAckReq, DeleteReq, GetAllocationMode, GetDoneReq, GetMetaReq, 
GetRevokeReq, - GetStartReq, PutDoneReq, PutRevokeReq, PutStartReq, + GetSourceKind, GetStartReq, PutDoneReq, PutRevokeReq, PutStartReq, SsdReplicaCommitReq, }, placement::{PlacementDefault, PlacementPolicy}, - put::{handle_put_done, handle_put_revoke, handle_put_start}, + put::{handle_put_done, handle_put_revoke, handle_put_start, handle_ssd_replica_commit}, }; use crate::ClientKvApiAccessTrait; use crate::client_kv_api::ClientKvApi; +use crate::client_kv_api::msg_pack::SsdReplicaPersistReq; use crate::cluster_manager::{ ClusterEvent, ClusterManager, ClusterManagerAccessTrait, NodeID, NodeIDString, }; @@ -116,8 +117,10 @@ pub struct InflightGetInfo { pub req_node_id: NodeID, pub len: u64, pub allocation: Arc, + pub source_allocation: Option>, pub route: Arc, pub allocation_mode: GetAllocationMode, + pub source_kind: GetSourceKind, } impl InflightGetInfo { @@ -201,6 +204,13 @@ pub struct KvRouteInfo { pub tomb_tag: NodeTombTag, } +#[derive(Clone, Debug)] +pub struct KvSsdRouteInfo { + pub node_id: NodeID, + pub len: u64, + pub tomb_tag: NodeTombTag, +} + #[derive(Debug)] pub struct OneKvNodesRoutes { /// the version id for a kv put operation @@ -230,6 +240,8 @@ pub struct OneKvNodesRoutes { /// node_id -> KvRouteInfo pub nodes_replicas: RwLock>, + /// node_id -> SSD replica metadata for the same key-version. 
+ pub ssd_replicas: RwLock>, pub get_durable_slots_used: AtomicU32, } @@ -247,9 +259,16 @@ impl OneKvNodesRoutes { let mut nodes_replicas = self.nodes_replicas.write(); nodes_replicas.retain(|_, kv_info| !tombs.contains(&kv_info.node_id)); + let mut ssd_replicas = self.ssd_replicas.write(); + ssd_replicas.retain(|_, kv_info| !tombs.contains(&kv_info.node_id)); + return true; } + fn has_live_replica(&self) -> bool { + !self.nodes_replicas.read().is_empty() || !self.ssd_replicas.read().is_empty() + } + fn try_reserve_get_durable_slot(&self) -> bool { self.get_durable_slots_used .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| { @@ -283,6 +302,7 @@ mod tests { put_id: (1, 0), lease_id: None, nodes_replicas: RwLock::new(HashMap::new()), + ssd_replicas: RwLock::new(HashMap::new()), get_durable_slots_used: AtomicU32::new(0), }; @@ -607,6 +627,7 @@ impl MasterKvRouter { fn register_rpc_callers(&self) { RPCCaller::::new().regist(self.0.view().p2p_module()); + RPCCaller::::new().regist(self.0.view().p2p_module()); } fn register_rpc_handlers(&self) { @@ -766,6 +787,22 @@ impl MasterKvRouter { Ok(()) }); + let view = self.0.view().clone(); + RPCHandler::::new().regist(p2p, move |resp, msg| { + let view = view.clone(); + let view2 = view.clone(); + let view_task = view2.clone(); + let _ = view.spawn("rpc_ssd_replica_commit", async move { + let t0 = Utc::now().timestamp_micros(); + let mut ack = handle_ssd_replica_commit(view_task, msg).await; + ack.serialize_part.server_process_us = Utc::now().timestamp_micros() - t0; + if let Err(e) = resp.send_resp(ack).await { + error!("Failed to send SsdReplicaCommitResp: {:?}", e); + } + }); + Ok(()) + }); + // --- MemHolder Handlers --- // let view = inner.view.clone(); // RPCHandler::::new().regist(p2p, move |resp, msg| { diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs index 9d5eb1d..bdd85b6 100755 --- 
a/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/msg_pack.rs @@ -18,6 +18,13 @@ pub enum GetAllocationMode { DurableReplica = 2, } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Encode, Decode)] +pub enum GetSourceKind { + #[default] + Memory = 0, + Ssd = 1, +} + #[derive(Default, Debug, Clone, Encode, Decode)] pub struct GetStartReq { pub key: String, @@ -32,6 +39,7 @@ pub struct GetStartResp { pub get_id: u64, pub node_id: NodeIDString, pub put_id: PutIDForAKey, + pub source_kind: GetSourceKind, // absolute addresses because Mooncake transfer engine requires absolute addresses (not offsets) pub target_addr: u64, pub src_addr: u64, @@ -39,6 +47,8 @@ pub struct GetStartResp { pub target_base_addr: u64, pub src_base_addr: u64, pub len: u64, + /// SSD source staging bytes available at src_addr. Zero for memory sources. + pub ssd_stage_len: u64, pub error_code: ErrorCode, pub error_json: String, /// Server-side processing time in microseconds for this RPC handler @@ -56,6 +66,8 @@ impl RPCReq for GetStartReq { #[derive(Default, Debug, Clone, Encode, Decode)] pub struct GetRevokeReq { pub get_id: u64, + /// True only when an SSD stage failed and the source must be removed from routing. 
+ pub drop_ssd_source: bool, } impl MsgPackSerializePart for GetRevokeReq { fn msg_id(&self) -> u32 { @@ -250,6 +262,34 @@ impl RPCReq for PutDoneReq { type Resp = PutDoneResp; } +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaCommitReq { + pub key: String, + pub put_id: PutIDForAKey, + pub node_id: NodeIDString, + pub len: u64, +} +impl MsgPackSerializePart for SsdReplicaCommitReq { + fn msg_id(&self) -> u32 { + MsgId::SsdReplicaCommitReq as u32 + } +} +#[derive(Default, Debug, Clone, Encode, Decode)] +pub struct SsdReplicaCommitResp { + pub error_code: ErrorCode, + pub error_json: String, + /// Server-side processing time in microseconds for this RPC handler + pub server_process_us: i64, +} +impl MsgPackSerializePart for SsdReplicaCommitResp { + fn msg_id(&self) -> u32 { + MsgId::SsdReplicaCommitResp as u32 + } +} +impl RPCReq for SsdReplicaCommitReq { + type Resp = SsdReplicaCommitResp; +} + // --- RPC for MemHolder KeepAlive --- #[derive(Default, Debug, Clone, Encode, Decode)] diff --git a/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs b/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs index 70d8858..06e41cc 100755 --- a/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs +++ b/fluxon_rs/fluxon_kv/src/master_kv_router/put.rs @@ -1,15 +1,19 @@ -use super::NodeValueReplicaDesc; use super::{ InflightPutAllocation, InflightPutInfo, KvRouteInfo, MasterKvRouterView, PutPlacementMode, - msg_pack::{PutDoneReq, PutDoneResp, PutRevokeReq, PutRevokeResp, PutStartReq, PutStartResp}, + msg_pack::{ + PutDoneReq, PutDoneResp, PutRevokeReq, PutRevokeResp, PutStartReq, PutStartResp, + SsdReplicaCommitReq, SsdReplicaCommitResp, + }, placement::PutPlacementTarget, }; +use super::{KvSsdRouteInfo, NodeValueReplicaDesc}; +use crate::client_kv_api::msg_pack::SsdReplicaPersistReq; use crate::master_kv_router::OneKvNodesRoutes; use crate::master_kv_router::delete::DeleteKeyInfo; use crate::{ cluster_manager::{META_KEY_LOCAL_IPC_ROOT, NodeID}, 
master_seg_manager::one_seg_allocator::Allocation, - p2p::msg_pack::MsgPack, + p2p::msg_pack::{MsgPack, RPCCaller}, rpcresp_kvresult_convert::msg_and_error, }; use fluxon_commu::{META_KEY_SHARED_STORAGE_NODE_ID, META_KEY_SHARED_STORAGE_NODE_START_TIME}; @@ -19,6 +23,7 @@ use rand::seq::SliceRandom; use std::{ collections::HashMap, sync::{Arc, atomic::AtomicU32}, + time::Duration, }; pub type PutIDForAKey = (u64, u32); @@ -474,6 +479,171 @@ pub async fn handle_put_revoke( } } +fn spawn_ssd_replica_persist_request( + view: &MasterKvRouterView, + key: String, + put_id: PutIDForAKey, + node_id: NodeID, + len: u64, + allocation: Arc, +) { + let target_addr = allocation.base_addr() + allocation.addr(); + let view = view.clone(); + let view_task = view.clone(); + let _ = view.spawn("post_put_ssd_replica_persist", async move { + let _allocation_guard = allocation; + let req = MsgPack { + serialize_part: SsdReplicaPersistReq { + key: key.clone(), + put_id, + target_addr, + len, + }, + raw_bytes: Vec::new(), + }; + let resp = RPCCaller::::new() + .call( + view_task.p2p_module(), + node_id.clone(), + req, + Some(Duration::from_secs(60)), + 2, + ) + .await; + match resp { + Ok(resp) => { + if let Err(err) = crate::rpcresp_kvresult_convert::try_from_code( + resp.serialize_part.error_code, + resp.serialize_part.error_json, + ) { + tracing::warn!( + "SSD replica persist failed: key={} put_id=({},{}) node={} err={}", + key, + put_id.0, + put_id.1, + node_id, + err + ); + } else if resp.serialize_part.persisted { + tracing::debug!( + "SSD replica persist completed: key={} put_id=({},{}) node={}", + key, + put_id.0, + put_id.1, + node_id + ); + } else { + tracing::debug!( + "SSD replica persist skipped because owner has no SSD store: key={} put_id=({},{}) node={}", + key, + put_id.0, + put_id.1, + node_id + ); + } + } + Err(err) => { + tracing::warn!( + "SSD replica persist RPC failed: key={} put_id=({},{}) node={} err={:?}", + key, + put_id.0, + put_id.1, + node_id, + err + ); + } 
+ } + }); +} + +fn ok_ssd_replica_commit_resp() -> MsgPack { + MsgPack { + serialize_part: SsdReplicaCommitResp { + error_code: msg_and_error::OK, + error_json: String::new(), + server_process_us: 0, + }, + raw_bytes: Vec::new(), + } +} + +pub async fn handle_ssd_replica_commit( + view: MasterKvRouterView, + req: MsgPack, +) -> MsgPack { + let req = req.serialize_part; + let node_id: NodeID = req.node_id.clone().into(); + let Some(route_ref) = view.master_kv_router().inner().kv_routes.get(&req.key) else { + tracing::debug!( + "Ignoring SSD replica commit for missing key: key={} put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + }; + let route = route_ref.value().clone(); + drop(route_ref); + + if route.put_id != req.put_id { + tracing::debug!( + "Ignoring stale SSD replica commit: key={} req_put_id=({},{}) current_put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + route.put_id.0, + route.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + } + + let tomb_tag = { + let replicas = route.nodes_replicas.read(); + let Some(memory_replica) = replicas.get(&node_id) else { + tracing::debug!( + "Ignoring SSD replica commit without matching memory replica: key={} put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + }; + memory_replica.tomb_tag.clone() + }; + + if tomb_tag.is_tomb() { + tracing::debug!( + "Ignoring SSD replica commit for tombed node: key={} put_id=({},{}) node={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id + ); + return ok_ssd_replica_commit_resp(); + } + + route.ssd_replicas.write().insert( + node_id.clone(), + KvSsdRouteInfo { + node_id, + len: req.len, + tomb_tag, + }, + ); + tracing::debug!( + "Committed SSD replica route: key={} put_id=({},{}) node={} len={}", + req.key, + req.put_id.0, + req.put_id.1, + req.node_id, + req.len + ); + 
ok_ssd_replica_commit_resp() +} + pub async fn handle_put_done( view: MasterKvRouterView, req: MsgPack, @@ -488,6 +658,7 @@ pub async fn handle_put_done( if let Some(InflightPutInfo { node_id, key, + len, src_target_allocation, .. }) = view @@ -631,8 +802,9 @@ pub async fn handle_put_done( let completed_info = KvRouteInfo { node_id: node_id.clone(), allocation: Arc::new(target_allocation), - tomb_tag, + tomb_tag: tomb_tag.clone(), }; + let target_allocation_for_ssd = Arc::clone(&completed_info.allocation); // Insert into kv_routes with replica support let mut old_one_kv_routes: Option> = None; @@ -649,6 +821,7 @@ pub async fn handle_put_done( put_id, lease_id: lease_id_opt, nodes_replicas: RwLock::new(HashMap::new()), + ssd_replicas: RwLock::new(HashMap::new()), get_durable_slots_used: AtomicU32::new(0), }) }); @@ -659,6 +832,7 @@ pub async fn handle_put_done( put_id, lease_id: lease_id_opt, nodes_replicas: RwLock::new(HashMap::new()), + ssd_replicas: RwLock::new(HashMap::new()), get_durable_slots_used: AtomicU32::new(0), }); } @@ -668,6 +842,15 @@ pub async fn handle_put_done( .insert(node_id.clone(), completed_info); } + spawn_ssd_replica_persist_request( + &view, + key.clone(), + put_id, + node_id.clone(), + len, + target_allocation_for_ssd, + ); + if let Some(old) = old_one_kv_routes { if let Err(err) = view .master_kv_router() diff --git a/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs b/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs index 5c20cc1..5d344c9 100755 --- a/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs +++ b/fluxon_rs/fluxon_kv/src/master_lease_manager/lease_manager_test.rs @@ -22,7 +22,8 @@ async fn test1_lease_expire_removes_keys() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t1", "lease_client_t1").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t1", "lease_client_t1").await; 
let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; @@ -82,7 +83,8 @@ async fn test2_rebind_to_new_lease_preserves_until_new_expire() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t2", "lease_client_t2").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t2", "lease_client_t2").await; let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; @@ -161,7 +163,8 @@ async fn test3_keepalive() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t3", "lease_client_t3").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t3", "lease_client_t3").await; let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; @@ -236,7 +239,8 @@ async fn test4_delete_under_lease_then_get_fails() { unsafe { std::env::set_var("FLUXON_LOG", "debug"); } - let (master_fw, client_fw) = start_master_and_client("lease_master_t4", "lease_client_t4").await; + let (master_fw, client_fw) = + start_master_and_client("lease_master_t4", "lease_client_t4").await; let client_view = client_fw.client_kv_api_view(); wait_master_ready(&client_view).await; diff --git a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs index 692a9a0..cfd6d55 100644 --- a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs +++ b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs @@ -101,6 +101,7 @@ fn new_client_config_with_size( large_file_paths: crate::config::LargeFilePaths { paths: vec![format!("/tmp/kvcache_large/{}", instance_key)], }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } @@ -134,6 +135,7 @@ fn new_zero_contribution_client_config( }, share_mem_path: format!("/tmp/kvcache_shared_memory/{}", owner_instance_key), large_file_paths: 
crate::config::LargeFilePaths { paths: Vec::new() }, + ssd_storage: None, test_spec_config: TestSpecConfig::default(), } } diff --git a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs index 42a9cbc..def8b1c 100644 --- a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs +++ b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/msg_and_error.rs @@ -35,6 +35,8 @@ pub enum MsgId { DeleteAckResp = 3024, BatchDeleteAckReq = 3029, BatchDeleteAckResp = 3030, + SsdReplicaCommitReq = 3031, + SsdReplicaCommitResp = 3032, GetMetaReq = 3019, GetMetaResp = 3020, BatchDeleteClientKvMetaCacheReq = 3021, diff --git a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs index b6eb7d6..a5a18b4 100755 --- a/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs +++ b/fluxon_rs/fluxon_kv/src/rpcresp_kvresult_convert/rpcresp_kvresult_convert.rs @@ -3,11 +3,12 @@ use super::msg_and_error::{ErrorCode, KvError, KvResult}; use crate::client_kv_api::msg_pack::{ ExternalDeleteAckResp, ExternalDeleteResp, ExternalGetResp, ExternalIsExistResp, ExternalPutCommitResp, ExternalPutRevokeResp, ExternalPutStartResp, ExternalPutTransferEndResp, + SsdReplicaPersistResp, SsdStageReadResp, }; use crate::master_kv_router::msg_pack::{ BatchDeleteAckResp, BatchDeleteClientKvMetaCacheResp, DeleteAckResp, DeleteResp, GetDoneResp, GetMasterOnlyMetricPartResp, GetMetaResp, GetRevokeResp, GetStartResp, MemHolderKeepAliveResp, - MemHolderReleaseResp, PutDoneResp, PutRevokeResp, PutStartResp, + MemHolderReleaseResp, PutDoneResp, PutRevokeResp, PutStartResp, SsdReplicaCommitResp, }; use crate::master_seg_manager::msg_pack::RequestSegmentRegistrationResp; use crate::memholder::ExternalMemHolderInfo; @@ -232,6 +233,26 @@ impl FromError for ExternalDeleteAckResp { } } } +impl FromError for 
SsdStageReadResp { + fn from_error(e: &KvError) -> Self { + let code = e.code(); + Self { + error_code: code, + error_json: e.to_json(), + ..Default::default() + } + } +} +impl FromError for SsdReplicaPersistResp { + fn from_error(e: &KvError) -> Self { + let code = e.code(); + Self { + error_code: code, + error_json: e.to_json(), + ..Default::default() + } + } +} // ---- FromError for Master KV Router Resps ---- impl FromError for GetStartResp { @@ -294,6 +315,16 @@ impl FromError for PutDoneResp { } } } +impl FromError for SsdReplicaCommitResp { + fn from_error(e: &KvError) -> Self { + let code = e.code(); + Self { + error_code: code, + error_json: e.to_json(), + ..Default::default() + } + } +} impl FromError for MemHolderKeepAliveResp { fn from_error(e: &KvError) -> Self { let code = e.code();