From 3c0e391973071e64eeb99b8e6d2c59039b6a2554 Mon Sep 17 00:00:00 2001 From: Yuhan Zhou Date: Tue, 3 Mar 2026 11:05:38 +0800 Subject: [PATCH 01/10] fix(dev): 1) fix torch version check; 2) sync bazel module sources with uv.lock --- README.md | 7 ++++++- tools/torch_version_manager.py | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 69fb39063..31f023c3c 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,9 @@ bash tools/build_proto_python.sh uv venv uv sync --all-extras --all-groups --verbose +# Update bazel module sources according to uv.lock +uv run python tools/update_module_http_archives.py --lockfile uv.lock --module MODULE.bazel + # Build C++ core + Python extension BUILD_CORE=1 BUILD_EXTENSION=1 uv run -vvv setup.py build_ext ``` @@ -59,11 +62,13 @@ BUILD_CORE=1 BUILD_EXTENSION=1 uv run -vvv setup.py build_ext - If Bazel fails to download LLVM, run `bash tools/download_and_set_local_llvm.sh`. -- If Bazel hits missing header errors like `fatal error: absl/log/log.h: No such file or directory`, the repo-root `external/` symlink is likely stale. Fix it with: +- If Bazel hits missing header errors like `fatal error: absl/log/log.h: No such file or directory`, the repo-root `external/` or `bazel-bin` symlink is likely stale. Fix it with: ```bash rm -f external && ln -s $(bazel info output_base)/external external + rm -f bazel-bin && ln -s $(bazel info bazel-bin) bazel-bin ``` + - If importing `tensorcast._C` fails with `cannot allocate memory in static TLS block`, rebuild on the latest `main` (TensorCast disables jemalloc initial-exec TLS to make `dlopen()` safe). As a temporary workaround for older builds, run with `GLIBC_TUNABLES=glibc.rtld.optional_static_tls=32768`. 
### Run services (local) diff --git a/tools/torch_version_manager.py b/tools/torch_version_manager.py index d8eaee636..859624600 100644 --- a/tools/torch_version_manager.py +++ b/tools/torch_version_manager.py @@ -139,8 +139,10 @@ def validate_torch_versions(raise_on_error: bool = True) -> Tuple[bool, Dict[str versions.update(_get_pyproject_torch_versions()) - unique_versions = set(versions.values()) - is_consistent = len(unique_versions) <= 1 + # Check for consistency using base versions (ignore local tags like +cu128 or +cpu) + base_versions = {k: v.split("+")[0] for k, v in versions.items()} + unique_base_versions = set(base_versions.values()) + is_consistent = len(unique_base_versions) <= 1 if not is_consistent: msg_lines = ["Torch version mismatch detected:"] + [f" {k}: {v}" for k, v in sorted(versions.items())] From 818d5526d293c068e7498065ed44525e3eb42ff3 Mon Sep 17 00:00:00 2001 From: Yuhan Zhou Date: Wed, 4 Mar 2026 16:53:46 +0800 Subject: [PATCH 02/10] fix(dev): explicitly add pytz as dependency to avoid import error in duckDB (e.g. 
with python>=3.11) --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 4553e66ee..f8661d4a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "grpcio>=1.56.0,<2", "grpcio-health-checking>=1.56.0,<2", "pandas>=2.2.3", + "pytz>=2025.2", "prometheus-client>=0.21.1", "py-grpc-prometheus>=0.8.0", "torch==2.8.0+cu128", From 7bc90756f0282ca7f618bbe2be2aea7e90b4d8b5 Mon Sep 17 00:00:00 2001 From: Yuhan Zhou Date: Wed, 11 Mar 2026 22:31:40 +0800 Subject: [PATCH 03/10] fix(store): 1) add more logging points for strided load in byte_range_mapped_source; 2) reuse StridedBlock cache; 3) add direct gather path for CPU mem in strided load --- core/common/memory/streaming_pinned_buffer.cc | 32 +- .../dataplane/contracts/source.h | 5 + .../materialization/dataplane/runtime/pump.cc | 31 ++ .../sources/byte_range_mapped_source.cc | 435 +++++++++++++++++- .../sources/byte_range_mapped_source.h | 9 +- .../dataplane/sources/memory_source.cc | 34 ++ .../dataplane/sources/memory_source.h | 4 + .../dataplane/view/view_plan_source.cc | 8 + .../dataplane/view/view_plan_source.h | 1 + 9 files changed, 547 insertions(+), 12 deletions(-) diff --git a/core/common/memory/streaming_pinned_buffer.cc b/core/common/memory/streaming_pinned_buffer.cc index c950e68cb..0147d161e 100644 --- a/core/common/memory/streaming_pinned_buffer.cc +++ b/core/common/memory/streaming_pinned_buffer.cc @@ -2,6 +2,8 @@ #include "core/common/memory/streaming_pinned_buffer.h" +#include + #include "absl/log/log.h" #include "absl/strings/str_cat.h" #include "absl/synchronization/mutex.h" @@ -261,6 +263,12 @@ absl::StatusOr StreamingPinnedBuffer::get_free_chunk() { LOG(INFO) << "StreamingPinnedBuffer wait resolved after " << absl::FormatDuration(absl::Now() - wait_start) << ": slot=" << slot_id << " now available"; } + if (wait_started) { + VLOG(2) << "streaming_buffer.get_free_chunk.wait_resolved slot=" << slot_id << " wait_us=" + << 
static_cast(std::max(0, absl::ToInt64Microseconds(absl::Now() - wait_start))) + << " free_queue_size=" << free_queue_.size() << " ready_queue_size=" << ready_queue_.size() + << " chunks_produced=" << chunks_produced_ << " chunks_consumed=" << chunks_consumed_; + } return slot_id; } @@ -358,6 +366,10 @@ absl::Status StreamingPinnedBuffer::mark_chunk_ready(int slot_id, size_t global_ set_slot_state_unsafe(slot_id, SlotState::kReady); chunks_produced_++; ready_cv_.Signal(); + VLOG(2) << "streaming_buffer.mark_chunk_ready slot=" << slot_id << " chunk_id=" << global_chunk_id + << " bytes=" << bytes_in_chunk << " ready_queue_size=" << ready_queue_.size() + << " free_queue_size=" << free_queue_.size() << " chunks_produced=" << chunks_produced_ + << " chunks_consumed=" << chunks_consumed_; return absl::OkStatus(); } @@ -369,7 +381,13 @@ absl::StatusOr StreamingPinnedBuffer::get_rea } // Wait for a ready chunk or production complete + absl::Time wait_start; + bool wait_started = false; while (ready_queue_.empty() && !production_complete_) { + if (!wait_started) { + wait_started = true; + wait_start = absl::Now(); + } ready_cv_.Wait(&mutex_); } @@ -392,6 +410,13 @@ absl::StatusOr StreamingPinnedBuffer::get_rea } set_slot_state_unsafe(chunk.slot_id, SlotState::kConsumerOwned); slot_chunk_ids_[chunk.slot_id] = chunk.global_chunk_id; + if (wait_started) { + VLOG(2) << "streaming_buffer.get_ready_chunk.wait_resolved slot=" << chunk.slot_id + << " chunk_id=" << chunk.global_chunk_id << " wait_us=" + << static_cast(std::max(0, absl::ToInt64Microseconds(absl::Now() - wait_start))) + << " ready_queue_size=" << ready_queue_.size() << " chunks_produced=" << chunks_produced_ + << " chunks_consumed=" << chunks_consumed_; + } return chunk; } @@ -420,6 +445,9 @@ absl::Status StreamingPinnedBuffer::return_chunk(int slot_id) { free_queue_.push(slot_id); chunks_consumed_++; free_cv_.Signal(); + VLOG(2) << "streaming_buffer.return_chunk slot=" << slot_id << " free_queue_size=" << 
free_queue_.size() + << " ready_queue_size=" << ready_queue_.size() << " chunks_produced=" << chunks_produced_ + << " chunks_consumed=" << chunks_consumed_; return absl::OkStatus(); } @@ -435,7 +463,9 @@ void StreamingPinnedBuffer::signal_production_complete() { production_complete_ = true; ready_cv_.SignalAll(); free_cv_.SignalAll(); - VLOG(1) << "Production complete. Total chunks produced: " << chunks_produced_; + VLOG(1) << "Production complete. Total chunks produced: " << chunks_produced_ + << " chunks_consumed=" << chunks_consumed_ << " free_queue_size=" << free_queue_.size() + << " ready_queue_size=" << ready_queue_.size(); } bool StreamingPinnedBuffer::is_consumption_complete() const { diff --git a/core/store/materialization/dataplane/contracts/source.h b/core/store/materialization/dataplane/contracts/source.h index 2129f497e..5639f9fc3 100644 --- a/core/store/materialization/dataplane/contracts/source.h +++ b/core/store/materialization/dataplane/contracts/source.h @@ -23,6 +23,11 @@ class SeekableSource : public Source { virtual absl::StatusOr read_at(uint64_t offset, void* dst, size_t bytes) = 0; + // Optional raw CPU memory pointer for fast-path gather. + [[nodiscard]] virtual const uint8_t* cpu_base_ptr() const { + return nullptr; + } + // Optional zero-copy capability: direct write into destination address space. // Default implementations disable the feature. 
[[nodiscard]] virtual bool supports_direct_write_at() const { diff --git a/core/store/materialization/dataplane/runtime/pump.cc b/core/store/materialization/dataplane/runtime/pump.cc index 3640bcc33..68f12c302 100644 --- a/core/store/materialization/dataplane/runtime/pump.cc +++ b/core/store/materialization/dataplane/runtime/pump.cc @@ -2,6 +2,7 @@ #include "core/store/materialization/dataplane/runtime/pump.h" +#include #include #include #include @@ -331,6 +332,13 @@ void run_range_producer( absl::Span> ranges, std::atomic& range_index, PumpState& state) { + const absl::Time producer_start = absl::Now(); + uint64_t produced_chunks = 0; + uint64_t produced_bytes = 0; + uint64_t wait_free_chunk_us_total = 0; + uint64_t read_at_us_total = 0; + uint64_t mark_ready_us_total = 0; + const auto fail_producer = [&](absl::Status status) { absl::MutexLock lock(&state.status_mutex); if (state.producer_status.ok()) { @@ -352,7 +360,11 @@ void run_range_producer( uint64_t current_offset = offset; while (remaining > 0 && !state.should_stop.load(std::memory_order_acquire)) { + const absl::Time wait_free_start = absl::Now(); auto slot_result = pool.get_free_chunk(); + const uint64_t wait_free_us = + static_cast(std::max(0, absl::ToInt64Microseconds(absl::Now() - wait_free_start))); + wait_free_chunk_us_total += wait_free_us; if (!slot_result.ok()) { fail_producer(slot_result.status()); break; @@ -375,7 +387,11 @@ void run_range_producer( break; } + const absl::Time read_start = absl::Now(); auto read_result = src.read_at(current_offset, buffer, to_read); + const uint64_t read_us = + static_cast(std::max(0, absl::ToInt64Microseconds(absl::Now() - read_start))); + read_at_us_total += read_us; if (!read_result.ok()) { fail_producer(read_result.status()); break; @@ -409,7 +425,11 @@ void run_range_producer( state.chunk_offsets.emplace(chunk_id, current_offset); } + const absl::Time mark_ready_start = absl::Now(); auto status = pool.mark_chunk_ready(slot_id, chunk_id, bytes_read); + 
const uint64_t mark_ready_us = + static_cast(std::max(0, absl::ToInt64Microseconds(absl::Now() - mark_ready_start))); + mark_ready_us_total += mark_ready_us; if (!status.ok()) { record_copy_failure("cpu"); fail_producer(status); @@ -421,8 +441,19 @@ void run_range_producer( remaining -= bytes_read; // Transfer ownership to consumer; avoid returning the slot here lease.release(); + produced_chunks += 1; + produced_bytes += bytes_read; + VLOG(2) << "pump_producer_chunk range_index=" << idx << " chunk_id=" << chunk_id << " slot=" << slot_id + << " src_offset=" << (current_offset - bytes_read) << " bytes=" << bytes_read + << " wait_free_chunk_us=" << wait_free_us << " read_at_us=" << read_us + << " mark_chunk_ready_us=" << mark_ready_us << " remaining_in_range=" << remaining; } } + VLOG(2) << "pump_producer_summary ranges=" << ranges.size() << " produced_chunks=" << produced_chunks + << " produced_bytes=" << produced_bytes << " wait_free_chunk_us_total=" << wait_free_chunk_us_total + << " read_at_us_total=" << read_at_us_total << " mark_chunk_ready_us_total=" << mark_ready_us_total + << " duration_us=" + << static_cast(std::max(0, absl::ToInt64Microseconds(absl::Now() - producer_start))); } } // namespace diff --git a/core/store/materialization/dataplane/sources/byte_range_mapped_source.cc b/core/store/materialization/dataplane/sources/byte_range_mapped_source.cc index 3b5dcd51e..f524cdfa5 100644 --- a/core/store/materialization/dataplane/sources/byte_range_mapped_source.cc +++ b/core/store/materialization/dataplane/sources/byte_range_mapped_source.cc @@ -3,10 +3,12 @@ #include "core/store/materialization/dataplane/sources/byte_range_mapped_source.h" #include +#include #include #include #include #include +#include #include #include "absl/base/thread_annotations.h" @@ -105,11 +107,28 @@ void record_histogram( } } +const char* run_kind_to_cstr(ByteRangeRun::Kind kind) { + switch (kind) { + case ByteRangeRun::Kind::kPad: + return "pad"; + case 
ByteRangeRun::Kind::kContiguous: + return "contiguous"; + case ByteRangeRun::Kind::kStrided: + return "strided"; + } + return "unknown"; +} + +uint64_t elapsed_us(std::chrono::steady_clock::time_point start) { + return static_cast( + std::chrono::duration_cast(std::chrono::steady_clock::now() - start).count()); +} + } // namespace struct ByteRangeMappedSource::StridedBlockCache { absl::Mutex mutex; - std::shared_ptr block ABSL_GUARDED_BY(mutex); + std::shared_ptr block ABSL_GUARDED_BY(mutex); }; absl::StatusOr> ByteRangeMappedSource::Create( @@ -256,21 +275,31 @@ absl::StatusOr ByteRangeMappedSource::read_base( uint64_t offset, uint8_t* dst, size_t bytes) { + const auto start = std::chrono::steady_clock::now(); if (bytes == 0) { + VLOG(2) << "byte_range.read_base bytes=0 source_index=" << source_index << " offset=" << offset << " duration_us=0"; return static_cast(0); } if (source_index >= sources_.size()) { + VLOG(2) << "byte_range.read_base invalid_source source_index=" << source_index + << " source_count=" << sources_.size() << " offset=" << offset << " bytes=" << bytes; return absl::InvalidArgumentError("ByteRangeMappedSource source index out of range"); } stats_.base_read_calls.fetch_add(1, std::memory_order_relaxed); auto read_or = sources_[source_index]->read_at(offset, dst, bytes); if (!read_or.ok()) { + VLOG(2) << "byte_range.read_base read_failed source_index=" << source_index << " offset=" << offset + << " bytes=" << bytes << " duration_us=" << elapsed_us(start) << " status=" << read_or.status(); return read_or.status(); } stats_.base_read_bytes.fetch_add(*read_or, std::memory_order_relaxed); if (*read_or != bytes) { + VLOG(2) << "byte_range.read_base short_read source_index=" << source_index << " offset=" << offset + << " requested=" << bytes << " got=" << *read_or << " duration_us=" << elapsed_us(start); return absl::DataLossError("short read while executing byte range program"); } + VLOG(2) << "byte_range.read_base ok source_index=" << source_index << 
" offset=" << offset << " bytes=" << bytes + << " duration_us=" << elapsed_us(start); return *read_or; } @@ -279,31 +308,76 @@ absl::StatusOr ByteRangeMappedSource::copy_from_strided_rows( uint64_t run_offset, uint8_t* dst, size_t bytes) { + const auto start = std::chrono::steady_clock::now(); if (bytes == 0) { + VLOG(2) << "byte_range.strided.row_copy bytes=0 source_index=" << run.source_index << " run_offset=" << run_offset + << " duration_us=0"; return static_cast(0); } + if (run.source_index >= sources_.size()) { + VLOG(2) << "byte_range.strided.row_copy invalid_source source_index=" << run.source_index + << " source_count=" << sources_.size() << " run_offset=" << run_offset << " bytes=" << bytes; + return absl::InvalidArgumentError("ByteRangeMappedSource source index out of range"); + } if (run.row_len == 0) { + VLOG(2) << "byte_range.strided.row_copy invalid_row_len source_index=" << run.source_index + << " run_offset=" << run_offset << " bytes=" << bytes; return absl::InternalError("invalid strided run row length"); } + const auto& source = sources_[run.source_index]; const uint64_t first_row = run_offset / run.row_len; uint64_t row_offset = run_offset % run.row_len; uint64_t row = first_row; size_t remaining = bytes; uint8_t* out = dst; + uint64_t local_base_read_calls = 0; + uint64_t local_base_read_bytes = 0; while (remaining > 0 && row < run.rows) { const size_t available = static_cast(run.row_len - row_offset); const size_t take = std::min(remaining, available); const uint64_t src_offset = run.src_base + row * run.stride + row_offset; - auto copied_or = read_base(run.source_index, src_offset, out, take); + ++local_base_read_calls; + auto copied_or = source->read_at(src_offset, out, take); if (!copied_or.ok()) { + if (local_base_read_calls != 0) { + stats_.base_read_calls.fetch_add(local_base_read_calls, std::memory_order_relaxed); + } + if (local_base_read_bytes != 0) { + stats_.base_read_bytes.fetch_add(local_base_read_bytes, 
std::memory_order_relaxed); + } + VLOG(2) << "byte_range.strided.row_copy read_failed source_index=" << run.source_index << " row=" << row + << " src_offset=" << src_offset << " bytes=" << take << " duration_us=" << elapsed_us(start) + << " status=" << copied_or.status(); return copied_or.status(); } + if (*copied_or != take) { + if (local_base_read_calls != 0) { + stats_.base_read_calls.fetch_add(local_base_read_calls, std::memory_order_relaxed); + } + if (local_base_read_bytes != 0) { + stats_.base_read_bytes.fetch_add(local_base_read_bytes, std::memory_order_relaxed); + } + VLOG(2) << "byte_range.strided.row_copy short_read source_index=" << run.source_index << " row=" << row + << " src_offset=" << src_offset << " requested=" << take << " got=" << *copied_or + << " duration_us=" << elapsed_us(start); + return absl::DataLossError("short read while executing strided row copy"); + } + local_base_read_bytes += *copied_or; out += *copied_or; remaining -= *copied_or; row_offset = 0; ++row; } + if (local_base_read_calls != 0) { + stats_.base_read_calls.fetch_add(local_base_read_calls, std::memory_order_relaxed); + } + if (local_base_read_bytes != 0) { + stats_.base_read_bytes.fetch_add(local_base_read_bytes, std::memory_order_relaxed); + } + VLOG(2) << "byte_range.strided.row_copy done source_index=" << run.source_index << " first_row=" << first_row + << " bytes=" << (bytes - remaining) << " rows_touched=" << (row > first_row ? 
(row - first_row) : 0) + << " duration_us=" << elapsed_us(start); return bytes - remaining; } @@ -312,57 +386,266 @@ absl::StatusOr ByteRangeMappedSource::fill_strided_run( const ByteRangeRun& run, uint64_t run_offset, uint8_t* dst, - size_t bytes) { + size_t bytes, + uint64_t* pack_us_total, + size_t* pack_bytes_total, + uint64_t* cache_lookup_us_total, + uint64_t* block_prepare_us_total, + uint64_t* block_load_us_total, + uint64_t* row_copy_us_total, + size_t* row_copy_bytes_total) { + const auto run_start = std::chrono::steady_clock::now(); + uint64_t local_pack_us_total = 0; + size_t local_pack_bytes_total = 0; + uint64_t local_cache_lookup_us_total = 0; + uint64_t local_block_prepare_us_total = 0; + uint64_t local_block_load_us_total = 0; + uint64_t local_row_copy_us_total = 0; + size_t local_row_copy_bytes_total = 0; + uint64_t local_block_pick_us_total = 0; + uint64_t local_block_resize_us_total = 0; + uint64_t local_pack_memcpy_calls = 0; + size_t local_cache_hit_count = 0; + size_t local_cache_miss_count = 0; + size_t local_block_load_count = 0; + size_t local_block_reuse_count = 0; + size_t local_block_new_count = 0; + size_t local_block_bytes_total = 0; + auto flush_local_stats = [&]() { + if (pack_us_total != nullptr) { + *pack_us_total += local_pack_us_total; + } + if (pack_bytes_total != nullptr) { + *pack_bytes_total += local_pack_bytes_total; + } + if (cache_lookup_us_total != nullptr) { + *cache_lookup_us_total += local_cache_lookup_us_total; + } + if (block_prepare_us_total != nullptr) { + *block_prepare_us_total += local_block_prepare_us_total; + } + if (block_load_us_total != nullptr) { + *block_load_us_total += local_block_load_us_total; + } + if (row_copy_us_total != nullptr) { + *row_copy_us_total += local_row_copy_us_total; + } + if (row_copy_bytes_total != nullptr) { + *row_copy_bytes_total += local_row_copy_bytes_total; + } + }; + VLOG(2) << "byte_range.strided.begin run_index=" << run_index << " source_index=" << run.source_index + << " 
run_offset=" << run_offset << " bytes=" << bytes << " row_len=" << run.row_len + << " stride=" << run.stride << " rows=" << run.rows; if (bytes == 0) { + VLOG(2) << "byte_range.strided.end run_index=" << run_index << " copied=0 duration_us=0"; + flush_local_stats(); return static_cast(0); } if (!strided_cache_) { - return copy_from_strided_rows(run, run_offset, dst, bytes); + const auto row_copy_start = std::chrono::steady_clock::now(); + auto copied_or = copy_from_strided_rows(run, run_offset, dst, bytes); + const uint64_t row_copy_us = elapsed_us(row_copy_start); + local_row_copy_us_total += row_copy_us; + if (copied_or.ok()) { + local_row_copy_bytes_total += *copied_or; + } + if (copied_or.ok()) { + VLOG(2) << "byte_range.strided.end run_index=" << run_index << " mode=no_cache copied=" << *copied_or + << " row_copy_us=" << row_copy_us << " duration_us=" << elapsed_us(run_start); + } else { + VLOG(2) << "byte_range.strided.end run_index=" << run_index << " mode=no_cache status=" << copied_or.status() + << " row_copy_us=" << row_copy_us << " duration_us=" << elapsed_us(run_start); + } + flush_local_stats(); + return copied_or; } if (strided_disabled_ && strided_disabled_[run_index].load(std::memory_order_relaxed) != 0) { - return copy_from_strided_rows(run, run_offset, dst, bytes); + const auto row_copy_start = std::chrono::steady_clock::now(); + auto copied_or = copy_from_strided_rows(run, run_offset, dst, bytes); + const uint64_t row_copy_us = elapsed_us(row_copy_start); + local_row_copy_us_total += row_copy_us; + if (copied_or.ok()) { + local_row_copy_bytes_total += *copied_or; + } + if (copied_or.ok()) { + VLOG(2) << "byte_range.strided.end run_index=" << run_index << " mode=disabled copied=" << *copied_or + << " row_copy_us=" << row_copy_us << " duration_us=" << elapsed_us(run_start); + } else { + VLOG(2) << "byte_range.strided.end run_index=" << run_index << " mode=disabled status=" << copied_or.status() + << " row_copy_us=" << row_copy_us << " duration_us=" 
<< elapsed_us(run_start); + } + flush_local_stats(); + return copied_or; } if (run.row_len == 0 || run.rows == 0) { + VLOG(2) << "byte_range.strided.invalid run_index=" << run_index << " row_len=" << run.row_len + << " rows=" << run.rows; + flush_local_stats(); return absl::InternalError("invalid strided run"); } + constexpr uint64_t kDirectGatherMinRowLenBytes = 4 * 1024; + constexpr size_t kDirectGatherMinTotalBytes = 4 * 1024 * 1024; + constexpr uint64_t kDirectGatherMaxRowsTouched = 12 * 1024; + const uint64_t first_row = run_offset / run.row_len; + const uint64_t last_offset_exclusive = run_offset + static_cast(bytes); + const uint64_t last_row_exclusive = (last_offset_exclusive + run.row_len - 1) / run.row_len; + const uint64_t rows_touched = last_row_exclusive > first_row ? (last_row_exclusive - first_row) : 0; + auto& source = sources_[run.source_index]; + const uint8_t* cpu_source_base_ptr = source->cpu_base_ptr(); + VLOG(2) << "byte_range.strided.direct_gather_probe run_index=" << run_index << " source_index=" << run.source_index + << " source_type=" << typeid(*source).name() + << " has_cpu_base_ptr=" << (cpu_source_base_ptr != nullptr ? 
1 : 0) << " row_len=" << run.row_len + << " stride=" << run.stride << " bytes=" << bytes << " rows_touched=" << rows_touched; + const bool direct_gather_candidate = run.stride > run.row_len && run.row_len >= kDirectGatherMinRowLenBytes && + bytes >= kDirectGatherMinTotalBytes && rows_touched <= kDirectGatherMaxRowsTouched && + cpu_source_base_ptr != nullptr; + if (direct_gather_candidate) { + const auto gather_start = std::chrono::steady_clock::now(); + const uint8_t* base_ptr = cpu_source_base_ptr; + size_t copied = 0; + uint64_t row = first_row; + uint64_t row_offset = run_offset % run.row_len; + uint8_t* out_ptr = dst; + uint64_t direct_gather_memcpy_calls = 0; + while (copied < bytes && row < run.rows) { + const size_t available = static_cast(run.row_len - row_offset); + const size_t take = std::min(bytes - copied, available); + const uint64_t src_offset = run.src_base + row * run.stride + row_offset; + const uint64_t source_total = sources_[run.source_index]->total_bytes(); + if (src_offset > source_total || static_cast(take) > source_total - src_offset) { + const uint64_t gather_us = elapsed_us(gather_start); + VLOG(2) << "byte_range.strided.direct_gather_oob run_index=" << run_index + << " source_index=" << run.source_index << " src_offset=" << src_offset << " take=" << take + << " source_total=" << source_total << " copied=" << copied << " duration_us=" << gather_us; + flush_local_stats(); + return absl::OutOfRangeError("direct gather source range out of bounds"); + } + std::memcpy(out_ptr, base_ptr + src_offset, take); + ++direct_gather_memcpy_calls; + copied += take; + out_ptr += take; + row_offset = 0; + ++row; + } + const uint64_t gather_us = elapsed_us(gather_start); + local_row_copy_us_total += gather_us; + local_row_copy_bytes_total += copied; + if (copied != bytes) { + VLOG(2) << "byte_range.strided.direct_gather_short_copy run_index=" << run_index + << " source_index=" << run.source_index << " requested=" << bytes << " copied=" << copied + << " 
rows_touched=" << rows_touched << " duration_us=" << gather_us; + flush_local_stats(); + return absl::DataLossError("direct gather short copy"); + } + stats_.base_read_calls.fetch_add(direct_gather_memcpy_calls, std::memory_order_relaxed); + stats_.base_read_bytes.fetch_add(copied, std::memory_order_relaxed); + VLOG(2) << "byte_range.strided.direct_gather run_index=" << run_index << " source_index=" << run.source_index + << " rows_touched=" << rows_touched << " bytes=" << copied << " memcpy_calls=" << direct_gather_memcpy_calls + << " duration_us=" << gather_us; + flush_local_stats(); + return copied; + } auto load_block = [&](uint64_t row) -> absl::StatusOr> { + const auto cache_lookup_start = std::chrono::steady_clock::now(); if (strided_cache_) { absl::MutexLock lock(&strided_cache_->mutex); const auto cached = strided_cache_->block; if (cached && cached->run_index == run_index && row >= cached->first_row && row < cached->first_row + cached->rows) { + local_cache_lookup_us_total += elapsed_us(cache_lookup_start); stats_.cache_hits.fetch_add(1, std::memory_order_relaxed); + ++local_cache_hit_count; + VLOG(2) << "byte_range.strided.cache_hit run_index=" << run_index << " row=" << row + << " block_first_row=" << cached->first_row << " block_rows=" << cached->rows; return cached; } } + local_cache_lookup_us_total += elapsed_us(cache_lookup_start); stats_.cache_misses.fetch_add(1, std::memory_order_relaxed); + ++local_cache_miss_count; const uint64_t rows_per_block = std::max(run.rows_per_block, 1); const uint64_t block_first_row = row - (row % rows_per_block); uint64_t block_rows = std::min(rows_per_block, run.rows - block_first_row); while (block_rows > 0) { + const auto block_prepare_start = std::chrono::steady_clock::now(); const uint64_t block_bytes = (block_rows - 1) * run.stride + run.row_len; if (block_bytes > program_->strided_block_max_bytes) { + local_block_prepare_us_total += elapsed_us(block_prepare_start); block_rows /= 2; continue; } - auto block = 
std::make_shared(); + std::shared_ptr block; + bool reused_block = false; + const auto block_pick_start = std::chrono::steady_clock::now(); + if (strided_cache_) { + absl::MutexLock lock(&strided_cache_->mutex); + if (strided_cache_->block && strided_cache_->block.use_count() == 1) { + block = strided_cache_->block; + strided_cache_->block.reset(); + reused_block = true; + } + } + if (!block) { + block = std::make_shared(); + } + local_block_pick_us_total += elapsed_us(block_pick_start); + if (reused_block) { + ++local_block_reuse_count; + } else { + ++local_block_new_count; + } block->run_index = run_index; block->first_row = block_first_row; block->rows = block_rows; block->src_begin = run.src_base + block_first_row * run.stride; try { - block->data.resize(static_cast(block_bytes)); + const auto block_resize_start = std::chrono::steady_clock::now(); + const size_t target_bytes = static_cast(block_bytes); + if (block->data.capacity() < target_bytes) { + block->data.reserve(target_bytes); + } + if (block->data.size() != target_bytes) { + block->data.resize(target_bytes); + } + local_block_resize_us_total += elapsed_us(block_resize_start); } catch (const std::bad_alloc&) { + local_block_prepare_us_total += elapsed_us(block_prepare_start); + VLOG(2) << "byte_range.strided.block_alloc_retry run_index=" << run_index + << " block_first_row=" << block_first_row << " block_rows=" << block_rows + << " block_bytes=" << block_bytes; block_rows /= 2; continue; } + local_block_prepare_us_total += elapsed_us(block_prepare_start); + const auto block_read_start = std::chrono::steady_clock::now(); auto read_or = read_base(run.source_index, block->src_begin, block->data.data(), block->data.size()); + const uint64_t block_read_us = elapsed_us(block_read_start); + local_block_load_us_total += block_read_us; if (!read_or.ok()) { + VLOG(2) << "byte_range.strided.cache_miss_read_failed run_index=" << run_index + << " block_first_row=" << block_first_row << " block_rows=" << block_rows + 
<< " block_bytes=" << block->data.size() << " duration_us=" << block_read_us + << " status=" << read_or.status(); return read_or.status(); } + VLOG(2) << "byte_range.strided.cache_miss_loaded run_index=" << run_index + << " block_first_row=" << block_first_row << " block_rows=" << block_rows + << " block_bytes=" << block->data.size() << " duration_us=" << block_read_us; + ++local_block_load_count; + local_block_bytes_total += block->data.size(); + const double load_mib = static_cast(block->data.size()) / (1024.0 * 1024.0); + const double load_seconds = static_cast(block_read_us) / 1e6; + const double throughput_mib_s = load_seconds > 0.0 ? (load_mib / load_seconds) : 0.0; + VLOG(2) << "byte_range.strided.block_summary run_index=" << run_index << " source_index=" << run.source_index + << " block_first_row=" << block_first_row << " block_rows=" << block_rows + << " block_bytes=" << block->data.size() << " reused_block=" << (reused_block ? 1 : 0) + << " block_pick_us=" << local_block_pick_us_total << " block_resize_us=" << local_block_resize_us_total + << " block_prepare_us=" << local_block_prepare_us_total << " block_load_us=" << block_read_us + << " block_load_throughput_mib_s=" << throughput_mib_s; if (strided_cache_) { absl::MutexLock lock(&strided_cache_->mutex); strided_cache_->block = block; @@ -394,23 +677,41 @@ absl::StatusOr ByteRangeMappedSource::fill_strided_run( } } const size_t already_copied = bytes - remaining; + const auto fallback_start = std::chrono::steady_clock::now(); auto fallback_or = copy_from_strided_rows(run, local_offset, out, remaining); + const uint64_t fallback_us = elapsed_us(fallback_start); + local_row_copy_us_total += fallback_us; if (!fallback_or.ok()) { + VLOG(2) << "byte_range.strided.fallback_failed run_index=" << run_index + << " already_copied=" << already_copied << " remaining=" << remaining + << " duration_us=" << fallback_us << " status=" << fallback_or.status(); + flush_local_stats(); return fallback_or.status(); } + 
local_row_copy_bytes_total += *fallback_or; + VLOG(2) << "byte_range.strided.fallback_done run_index=" << run_index << " already_copied=" << already_copied + << " fallback_copied=" << *fallback_or << " duration_us=" << fallback_us; + flush_local_stats(); return already_copied + *fallback_or; } + VLOG(2) << "byte_range.strided.block_load_failed run_index=" << run_index << " local_offset=" << local_offset + << " remaining=" << remaining << " status=" << block_or.status(); + flush_local_stats(); return block_or.status(); } const auto& block = *block_or; const uint64_t block_end_row = block->first_row + block->rows; uint64_t active_row = row; + const auto pack_start = std::chrono::steady_clock::now(); + size_t packed_bytes = 0; while (remaining > 0 && active_row < block_end_row) { const size_t available = static_cast(run.row_len - row_offset); const size_t take = std::min(remaining, available); const uint64_t block_offset = (active_row - block->first_row) * run.stride + row_offset; std::memcpy(out, block->data.data() + block_offset, take); + ++local_pack_memcpy_calls; stats_.pack_bytes.fetch_add(take, std::memory_order_relaxed); + packed_bytes += take; out += take; remaining -= take; local_offset += take; @@ -420,8 +721,27 @@ absl::StatusOr ByteRangeMappedSource::fill_strided_run( row_offset = 0; } } - } - + VLOG(2) << "byte_range.strided.pack_block run_index=" << run_index << " block_first_row=" << block->first_row + << " block_rows=" << block->rows << " packed_bytes=" << packed_bytes + << " duration_us=" << elapsed_us(pack_start); + local_pack_us_total += elapsed_us(pack_start); + local_pack_bytes_total += packed_bytes; + } + + VLOG(2) << "byte_range.strided.run_summary run_index=" << run_index << " source_index=" << run.source_index + << " requested_bytes=" << bytes << " copied_bytes=" << bytes << " cache_hits=" << local_cache_hit_count + << " cache_misses=" << local_cache_miss_count << " blocks_loaded=" << local_block_load_count + << " blocks_reused=" << 
local_block_reuse_count << " blocks_new=" << local_block_new_count + << " block_bytes_total=" << local_block_bytes_total << " block_pick_us_total=" << local_block_pick_us_total + << " block_resize_us_total=" << local_block_resize_us_total + << " block_prepare_us_total=" << local_block_prepare_us_total + << " block_load_us_total=" << local_block_load_us_total << " pack_us_total=" << local_pack_us_total + << " pack_memcpy_calls=" << local_pack_memcpy_calls << " pack_bytes_total=" << local_pack_bytes_total + << " row_copy_us_total=" << local_row_copy_us_total << " total_us=" << elapsed_us(run_start); + + VLOG(2) << "byte_range.strided.end run_index=" << run_index << " copied=" << bytes + << " duration_us=" << elapsed_us(run_start); + flush_local_stats(); return bytes; } @@ -435,7 +755,23 @@ absl::StatusOr ByteRangeMappedSource::read(void* dst, size_t max_bytes) } absl::StatusOr ByteRangeMappedSource::read_at(uint64_t offset, void* dst, size_t bytes) { + const auto read_start = std::chrono::steady_clock::now(); + uint64_t pad_us_total = 0; + uint64_t contiguous_us_total = 0; + uint64_t strided_us_total = 0; + uint64_t strided_pack_us_total = 0; + uint64_t strided_cache_lookup_us_total = 0; + uint64_t strided_block_prepare_us_total = 0; + uint64_t strided_block_load_us_total = 0; + uint64_t strided_row_copy_us_total = 0; + size_t pad_bytes_total = 0; + size_t contiguous_bytes_total = 0; + size_t strided_bytes_total = 0; + size_t strided_pack_bytes_total = 0; + size_t strided_row_copy_bytes_total = 0; if (offset >= program_->total_bytes || bytes == 0) { + VLOG(2) << "byte_range.read_at trivial offset=" << offset << " bytes=" << bytes + << " total_bytes=" << program_->total_bytes << " copied=0 duration_us=0"; return static_cast(0); } const uint64_t remaining_bytes = program_->total_bytes - offset; @@ -445,19 +781,30 @@ absl::StatusOr ByteRangeMappedSource::read_at(uint64_t offset, void* dst uint64_t cursor = offset; size_t run_index = find_run_index(offset); + VLOG(2) << 
"byte_range.read_at.begin offset=" << offset << " req_bytes=" << bytes << " to_copy=" << to_copy + << " run_index=" << run_index << " total_runs=" << program_->runs.size(); while (run_index < program_->runs.size() && copied < to_copy) { const auto& run = program_->runs[run_index]; + const auto run_start = std::chrono::steady_clock::now(); if (cursor >= run.dst_end) { + VLOG(2) << "byte_range.read_at.skip run_index=" << run_index << " kind=" << run_kind_to_cstr(run.kind) + << " cursor=" << cursor << " dst_end=" << run.dst_end; ++run_index; continue; } if (cursor < run.dst_begin) { + VLOG(2) << "byte_range.read_at.uncovered_gap run_index=" << run_index << " cursor=" << cursor + << " dst_begin=" << run.dst_begin << " duration_us=" << elapsed_us(run_start); return absl::InternalError("byte range program contains uncovered gaps"); } const uint64_t run_offset = cursor - run.dst_begin; const size_t available = static_cast(run.dst_end - cursor); const size_t chunk = std::min(to_copy - copied, available); + VLOG(2) << "byte_range.read_at.run_begin run_index=" << run_index << " kind=" << run_kind_to_cstr(run.kind) + << " source_index=" << run.source_index << " run_offset=" << run_offset << " chunk=" << chunk + << " dst_begin=" << run.dst_begin << " dst_end=" << run.dst_end; absl::StatusOr copied_or; + const auto kind_start = std::chrono::steady_clock::now(); switch (run.kind) { case ByteRangeRun::Kind::kPad: std::memset(out + copied, 0, chunk); @@ -467,22 +814,90 @@ absl::StatusOr ByteRangeMappedSource::read_at(uint64_t offset, void* dst case ByteRangeRun::Kind::kContiguous: copied_or = read_base(run.source_index, run.src_begin + run_offset, out + copied, chunk); break; + case ByteRangeRun::Kind::kStrided: { + uint64_t strided_pack_us = 0; + size_t strided_pack_bytes = 0; + uint64_t strided_cache_lookup_us = 0; + uint64_t strided_block_prepare_us = 0; + uint64_t strided_block_load_us = 0; + uint64_t strided_row_copy_us = 0; + size_t strided_row_copy_bytes = 0; + copied_or = 
fill_strided_run( + run_index, + run, + run_offset, + out + copied, + chunk, + &strided_pack_us, + &strided_pack_bytes, + &strided_cache_lookup_us, + &strided_block_prepare_us, + &strided_block_load_us, + &strided_row_copy_us, + &strided_row_copy_bytes); + strided_pack_us_total += strided_pack_us; + strided_pack_bytes_total += strided_pack_bytes; + strided_cache_lookup_us_total += strided_cache_lookup_us; + strided_block_prepare_us_total += strided_block_prepare_us; + strided_block_load_us_total += strided_block_load_us; + strided_row_copy_us_total += strided_row_copy_us; + strided_row_copy_bytes_total += strided_row_copy_bytes; + break; + } + } + const uint64_t kind_us = elapsed_us(kind_start); + switch (run.kind) { + case ByteRangeRun::Kind::kPad: + pad_us_total += kind_us; + break; + case ByteRangeRun::Kind::kContiguous: + contiguous_us_total += kind_us; + break; case ByteRangeRun::Kind::kStrided: - copied_or = fill_strided_run(run_index, run, run_offset, out + copied, chunk); + strided_us_total += kind_us; break; } if (!copied_or.ok()) { + VLOG(2) << "byte_range.read_at.run_failed run_index=" << run_index << " kind=" << run_kind_to_cstr(run.kind) + << " chunk=" << chunk << " duration_us=" << elapsed_us(run_start) << " status=" << copied_or.status(); return copied_or.status(); } + VLOG(2) << "byte_range.read_at.run_end run_index=" << run_index << " kind=" << run_kind_to_cstr(run.kind) + << " copied_chunk=" << *copied_or << " duration_us=" << elapsed_us(run_start); + switch (run.kind) { + case ByteRangeRun::Kind::kPad: + pad_bytes_total += *copied_or; + break; + case ByteRangeRun::Kind::kContiguous: + contiguous_bytes_total += *copied_or; + break; + case ByteRangeRun::Kind::kStrided: + strided_bytes_total += *copied_or; + break; + } copied += *copied_or; cursor += *copied_or; if (*copied_or == 0) { + VLOG(2) << "byte_range.read_at.stop_zero_progress run_index=" << run_index; break; } ++run_index; } stats_.output_bytes.fetch_add(copied, std::memory_order_relaxed); 
+ VLOG(2) << "byte_range.read_at.end offset=" << offset << " req_bytes=" << bytes << " to_copy=" << to_copy + << " copied=" << copied << " duration_us=" << elapsed_us(read_start); + VLOG(2) << "byte_range.read_at.summary offset=" << offset << " req_bytes=" << bytes << " copied=" << copied + << " duration_us=" << elapsed_us(read_start) << " pad_us_total=" << pad_us_total + << " contiguous_us_total=" << contiguous_us_total << " strided_us_total=" << strided_us_total + << " strided_pack_us_total=" << strided_pack_us_total + << " strided_cache_lookup_us_total=" << strided_cache_lookup_us_total + << " strided_block_prepare_us_total=" << strided_block_prepare_us_total + << " strided_block_load_us_total=" << strided_block_load_us_total + << " strided_row_copy_us_total=" << strided_row_copy_us_total << " pad_bytes_total=" << pad_bytes_total + << " contiguous_bytes_total=" << contiguous_bytes_total << " strided_bytes_total=" << strided_bytes_total + << " strided_pack_bytes_total=" << strided_pack_bytes_total + << " strided_row_copy_bytes_total=" << strided_row_copy_bytes_total; return copied; } diff --git a/core/store/materialization/dataplane/sources/byte_range_mapped_source.h b/core/store/materialization/dataplane/sources/byte_range_mapped_source.h index d6f39d3c4..821478f5d 100644 --- a/core/store/materialization/dataplane/sources/byte_range_mapped_source.h +++ b/core/store/materialization/dataplane/sources/byte_range_mapped_source.h @@ -66,7 +66,14 @@ class ByteRangeMappedSource final : public SeekableSource { const ByteRangeRun& run, uint64_t run_offset, uint8_t* dst, - size_t bytes); + size_t bytes, + uint64_t* pack_us_total, + size_t* pack_bytes_total, + uint64_t* cache_lookup_us_total, + uint64_t* block_prepare_us_total, + uint64_t* block_load_us_total, + uint64_t* row_copy_us_total, + size_t* row_copy_bytes_total); absl::Status zero_fill_to_grant(uint64_t dest_va_offset, size_t bytes, const DirectWriteGrant& grant); struct Stats { diff --git 
a/core/store/materialization/dataplane/sources/memory_source.cc b/core/store/materialization/dataplane/sources/memory_source.cc index 690f1a13e..255a33f1e 100644 --- a/core/store/materialization/dataplane/sources/memory_source.cc +++ b/core/store/materialization/dataplane/sources/memory_source.cc @@ -3,13 +3,24 @@ #include "core/store/materialization/dataplane/sources/memory_source.h" #include +#include #include +#include "absl/log/log.h" #include "absl/status/status.h" #include "core/cuda/cuda_api.h" namespace tensorcast::store::loader { +namespace { + +uint64_t elapsed_us(std::chrono::steady_clock::time_point start) { + return static_cast( + std::chrono::duration_cast(std::chrono::steady_clock::now() - start).count()); +} + +} // namespace + CpuMemorySource::CpuMemorySource(gsl::not_null base_ptr, uint64_t total_size) : base_ptr_(static_cast(base_ptr.get())), total_size_(total_size) {} @@ -23,11 +34,20 @@ absl::StatusOr CpuMemorySource::read(void* dst, size_t max_bytes) { } absl::StatusOr CpuMemorySource::read_at(uint64_t offset, void* dst, size_t bytes) { + const auto call_start = std::chrono::steady_clock::now(); if (offset >= total_size_ || bytes == 0) { return static_cast(0); } const size_t to_copy = static_cast(std::min(bytes, total_size_ - offset)); + const auto memcpy_start = std::chrono::steady_clock::now(); std::memcpy(dst, base_ptr_ + offset, to_copy); + const uint64_t memcpy_us = elapsed_us(memcpy_start); + const uint64_t total_us = elapsed_us(call_start); + const double copied_mib = static_cast(to_copy) / (1024.0 * 1024.0); + const double total_sec = static_cast(total_us) / 1e6; + const double throughput_mib_s = total_sec > 0.0 ? 
(copied_mib / total_sec) : 0.0; + VLOG(2) << "seekable.cpu_memory.read_at offset=" << offset << " bytes=" << to_copy << " memcpy_us=" << memcpy_us + << " total_us=" << total_us << " throughput_mib_s=" << throughput_mib_s; return to_copy; } @@ -44,22 +64,36 @@ absl::StatusOr GpuMemorySource::read(void* dst, size_t max_bytes) { } absl::StatusOr GpuMemorySource::read_at(uint64_t offset, void* dst, size_t bytes) { + const auto call_start = std::chrono::steady_clock::now(); if (offset >= total_size_ || bytes == 0) { return static_cast(0); } const size_t to_copy = static_cast(std::min(bytes, total_size_ - offset)); + const auto set_device_start = std::chrono::steady_clock::now(); if (auto st = tensorcast::cuda::set_device(device_id_); !st.ok()) { return st; } + const uint64_t set_device_us = elapsed_us(set_device_start); + const auto memcpy_start = std::chrono::steady_clock::now(); auto st = tensorcast::cuda::memcpy(dst, static_cast(device_ptr_.get()) + offset, to_copy, cudaMemcpyDeviceToHost); if (!st.ok()) { return st; } + const uint64_t memcpy_us = elapsed_us(memcpy_start); + const auto sync_start = std::chrono::steady_clock::now(); auto sync = tensorcast::cuda::device_synchronize(); if (!sync.ok()) { return sync; } + const uint64_t sync_us = elapsed_us(sync_start); + const uint64_t total_us = elapsed_us(call_start); + const double copied_mib = static_cast(to_copy) / (1024.0 * 1024.0); + const double total_sec = static_cast(total_us) / 1e6; + const double throughput_mib_s = total_sec > 0.0 ? 
(copied_mib / total_sec) : 0.0; + VLOG(2) << "seekable.gpu_memory.read_at offset=" << offset << " bytes=" << to_copy + << " set_device_us=" << set_device_us << " cuda_memcpy_us=" << memcpy_us << " sync_us=" << sync_us + << " total_us=" << total_us << " throughput_mib_s=" << throughput_mib_s; return to_copy; } diff --git a/core/store/materialization/dataplane/sources/memory_source.h b/core/store/materialization/dataplane/sources/memory_source.h index d593474ea..7c88a8c9e 100644 --- a/core/store/materialization/dataplane/sources/memory_source.h +++ b/core/store/materialization/dataplane/sources/memory_source.h @@ -18,6 +18,10 @@ class CpuMemorySource final : public SeekableSource { return total_size_; } + [[nodiscard]] const uint8_t* cpu_base_ptr() const override { + return base_ptr_; + } + absl::StatusOr read(void* dst, size_t max_bytes) override; absl::StatusOr read_at(uint64_t offset, void* dst, size_t bytes) override; diff --git a/core/store/materialization/dataplane/view/view_plan_source.cc b/core/store/materialization/dataplane/view/view_plan_source.cc index 38885390e..2a6f63827 100644 --- a/core/store/materialization/dataplane/view/view_plan_source.cc +++ b/core/store/materialization/dataplane/view/view_plan_source.cc @@ -48,6 +48,10 @@ absl::StatusOr ViewPlanSource::read_at(uint64_t offset, void* dst, size_ return mapped_source_->read_at(offset, dst, bytes); } +const uint8_t* ViewPlanSource::cpu_base_ptr() const { + return base_->cpu_base_ptr(); +} + bool ViewPlanSource::supports_direct_write_at() const { return mapped_source_->supports_direct_write_at(); } @@ -91,6 +95,10 @@ std::unique_ptr make_view_plan_source( return adapter_.read_at(offset, dst, bytes); } + [[nodiscard]] const uint8_t* cpu_base_ptr() const override { + return base_->cpu_base_ptr(); + } + [[nodiscard]] bool supports_direct_write_at() const override { return adapter_.supports_direct_write_at(); } diff --git a/core/store/materialization/dataplane/view/view_plan_source.h 
b/core/store/materialization/dataplane/view/view_plan_source.h index 596f4bc2c..0ca4b67d0 100644 --- a/core/store/materialization/dataplane/view/view_plan_source.h +++ b/core/store/materialization/dataplane/view/view_plan_source.h @@ -41,6 +41,7 @@ class ViewPlanSource final : public SeekableSource { absl::StatusOr read(void* dst, size_t max_bytes) override; absl::StatusOr read_at(uint64_t offset, void* dst, size_t bytes) override; + [[nodiscard]] const uint8_t* cpu_base_ptr() const override; [[nodiscard]] bool supports_direct_write_at() const override; absl::StatusOr read_into_at( uint64_t src_offset, From 5ffe9e5ee5a43cb973c1db5dd8c178a0d4d9560e Mon Sep 17 00:00:00 2001 From: Yuhan Zhou Date: Wed, 11 Mar 2026 22:33:06 +0800 Subject: [PATCH 04/10] fix(store): add CPU memfd region in local replica handle --- .../ingestion/materialization_facade.cc | 55 ++++++++++--------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/core/store/runtime/ingestion/materialization_facade.cc b/core/store/runtime/ingestion/materialization_facade.cc index 62dc57e82..175680c78 100644 --- a/core/store/runtime/ingestion/materialization_facade.cc +++ b/core/store/runtime/ingestion/materialization_facade.cc @@ -186,6 +186,28 @@ loading::ReplicaHandle build_local_replica_handle( handle.cpu_state = replica->get_memory_state(common::memory::MemoryLocation::CPU); handle.gpu_state = replica->get_memory_state(common::memory::MemoryLocation::GPU); handle.source = loading::MaterializationSource::kLocalReplica; + if (target_location == common::memory::MemoryLocation::GPU) { + const auto gpu_ptrs = replica->get_data_pointer(common::memory::MemoryLocation::GPU); + handle.gpu_base_ptr = (!gpu_ptrs.empty() && gpu_ptrs[0] != nullptr) ? 
gpu_ptrs[0] : nullptr; + auto ipc_or = replica->get_memory_manager().get_ipc_handle(); + if (ipc_or.ok()) { + handle.cuda_ipc_handle = cuda::IpcHandleBytes::from_native(*ipc_or); + } + return handle; + } + + auto uma = replica->get_memory_manager().memory_authority(); + if (uma != nullptr) { + const loading::ReplicaKey& allocation_key = replica->replica_key(); + auto region_or = uma->get_cpu_memfd_region(allocation_key); + if (region_or.ok()) { + handle.cpu_memfd_region = loading::CpuMemfdRegion{ + .fd = region_or->fd, + .size_bytes = region_or->size_bytes, + .offset_bytes = region_or->offset_bytes, + }; + } + } return handle; } @@ -370,6 +392,10 @@ class LocalReplicaSource final : public loader::SeekableSource { return source_->read_into_at(src_offset, dest_va_offset, bytes, grant); } + [[nodiscard]] const uint8_t* cpu_base_ptr() const override { + return source_->cpu_base_ptr(); + } + private: LocalReplicaSource(std::shared_ptr replica, std::shared_ptr source) : replica_(std::move(replica)), source_(std::move(source)) {} @@ -983,15 +1009,6 @@ absl::StatusOr MaterializationFacade::materialize_view_f auto build_local_view_handle = [&](const std::shared_ptr& replica_instance) { loading::ReplicaHandle handle = build_local_replica_handle(key, replica_instance, request.target_location()); - if (request.target_is_gpu()) { - const auto gpu_ptrs = replica_instance->get_data_pointer(common::memory::MemoryLocation::GPU); - handle.gpu_base_ptr = (!gpu_ptrs.empty() && gpu_ptrs[0] != nullptr) ? 
gpu_ptrs[0] : nullptr; - auto ipc_or = replica_instance->get_memory_manager().get_ipc_handle(); - if (ipc_or.ok()) { - handle.cuda_ipc_handle = cuda::IpcHandleBytes::from_native(*ipc_or); - } - } - const auto& replica_view_plan = replica_instance->view_plan(); const loader::ViewPlan* effective_plan = nullptr; if (replica_view_plan.has_value() && !replica_view_plan->is_identity) { @@ -1480,11 +1497,7 @@ absl::StatusOr MaterializationFacade::mate /*num_chunks=*/num_chunks, slice_bytes, config_.runtime_context->pinned_buffer_pool()); auto init_spb_status = session_spb->initialize( timeout, - make_materialize_into_target_pinned_wait_context( - hints, - target_device.ordinal, - num_chunks, - slice_bytes)); + make_materialize_into_target_pinned_wait_context(hints, target_device.ordinal, num_chunks, slice_bytes)); if (!init_spb_status.ok()) { return init_spb_status; } @@ -2839,20 +2852,8 @@ absl::StatusOr MaterializationFacade::assemble_from_piec return emplace_status; } - loading::ReplicaHandle handle; - handle.replica_key = key; - handle.ready_signal = replica->ready_signal_for(request.target_location()); - handle.cpu_state = replica->get_memory_state(common::memory::MemoryLocation::CPU); - handle.gpu_state = replica->get_memory_state(common::memory::MemoryLocation::GPU); + loading::ReplicaHandle handle = build_local_replica_handle(key, replica, request.target_location()); handle.source = loading::MaterializationSource::kP2P; - if (request.target_is_gpu()) { - const auto gpu_ptrs = replica->get_data_pointer(common::memory::MemoryLocation::GPU); - handle.gpu_base_ptr = (!gpu_ptrs.empty() && gpu_ptrs[0] != nullptr) ? 
gpu_ptrs[0] : nullptr; - auto ipc_or = replica->get_memory_manager().get_ipc_handle(); - if (ipc_or.ok()) { - handle.cuda_ipc_handle = cuda::IpcHandleBytes::from_native(*ipc_or); - } - } const auto& view_plan = replica->view_plan(); if (view_plan.has_value() && !view_plan->is_identity) { handle.view_index_json = view_plan->view_index_json; From c61169da5bee72228c89874b888ddedd259ff7e1 Mon Sep 17 00:00:00 2001 From: i-zhouyuhan Date: Fri, 13 Mar 2026 15:56:21 +0800 Subject: [PATCH 05/10] fix(daemon): make view_data_hash optional to avoid heavy hashing overhead for CPU view export --- core/store/materialization/contracts/loading_spec.h | 1 + core/store/runtime/ingestion/materialization_facade.cc | 4 ++-- core/store/runtime/ingestion/materialization_service.cc | 2 +- .../controllers/replica_materialization_service.cc | 2 ++ proto/tensorcast/daemon/v2/store_daemon.proto | 3 +++ tensorcast/api/_config.py | 1 + tensorcast/api/_materialize.py | 4 ++++ tensorcast/api/store/artifact.py | 8 +------- tensorcast/api/store/view_composer.py | 5 ++--- tensorcast/daemon_ctl.py | 6 ++++++ 10 files changed, 23 insertions(+), 13 deletions(-) diff --git a/core/store/materialization/contracts/loading_spec.h b/core/store/materialization/contracts/loading_spec.h index 2778050cf..a5b45d128 100644 --- a/core/store/materialization/contracts/loading_spec.h +++ b/core/store/materialization/contracts/loading_spec.h @@ -131,6 +131,7 @@ struct MaterializeHints { bool allow_p2p{true}; bool allow_disk{true}; ExportPolicy export_policy{ExportPolicy::kNever}; + bool need_view_data_hash{true}; SourceMutationPolicy source_mutation_policy{SourceMutationPolicy::kReadWrite}; std::optional variant; diff --git a/core/store/runtime/ingestion/materialization_facade.cc b/core/store/runtime/ingestion/materialization_facade.cc index 175680c78..2ee453bf9 100644 --- a/core/store/runtime/ingestion/materialization_facade.cc +++ b/core/store/runtime/ingestion/materialization_facade.cc @@ -1018,7 +1018,7 @@ 
absl::StatusOr MaterializationFacade::materialize_view_f } if (effective_plan != nullptr) { handle.view_index_json = effective_plan->view_index_json; - if (effective_plan->view_size_bytes > 0) { + if (request.hints().need_view_data_hash && effective_plan->view_size_bytes > 0) { auto computer = config_.runtime_context->view_hash_computer(); if (computer) { auto hash = computer->hash_replica_view( @@ -2858,7 +2858,7 @@ absl::StatusOr MaterializationFacade::assemble_from_piec if (view_plan.has_value() && !view_plan->is_identity) { handle.view_index_json = view_plan->view_index_json; const uint64_t view_size = view_plan->view_size_bytes; - if (view_size > 0) { + if (request.hints().need_view_data_hash && view_size > 0) { auto computer = config_.runtime_context->view_hash_computer(); if (computer) { auto hash = computer->hash_replica_view( diff --git a/core/store/runtime/ingestion/materialization_service.cc b/core/store/runtime/ingestion/materialization_service.cc index 318766316..824cb0644 100644 --- a/core/store/runtime/ingestion/materialization_service.cc +++ b/core/store/runtime/ingestion/materialization_service.cc @@ -539,7 +539,7 @@ ReplicaHandle MaterializationService::build_handle( if (view_plan.has_value() && !view_plan->is_identity) { handle.view_index_json = view_plan->view_index_json; const uint64_t view_size = view_plan->view_size_bytes; - if (view_size > 0 && deps_.view_hash_computer) { + if (request.hints().need_view_data_hash && view_size > 0 && deps_.view_hash_computer) { const bool target_is_gpu = request.target_is_gpu(); const bool target_loaded = target_is_gpu ? 
handle.gpu_state == MemoryState::LOADED : handle.cpu_state == MemoryState::LOADED; diff --git a/daemon/service/controllers/replica_materialization_service.cc b/daemon/service/controllers/replica_materialization_service.cc index 8a51602b5..76b3647f8 100644 --- a/daemon/service/controllers/replica_materialization_service.cc +++ b/daemon/service/controllers/replica_materialization_service.cc @@ -254,6 +254,7 @@ grpc::Status ReplicaMaterializationService::materialize_replica( span->SetAttribute("tc.store.preference", static_cast(effective_policy.preference)); span->SetAttribute("tc.store.allow_p2p", effective_policy.allow_p2p); span->SetAttribute("tc.store.allow_disk", effective_policy.allow_disk); + span->SetAttribute("tc.view.need_data_hash", req.has_need_view_data_hash() ? req.need_view_data_hash() : true); using v2::MaterializeReplicaStatus; if (d_.shutdown_signal.is_shutting_down()) { @@ -659,6 +660,7 @@ grpc::Status ReplicaMaterializationService::materialize_replica( hints.source_mutation_policy = store::loading::SourceMutationPolicy::kReadOnly; } hints.export_policy = to_hint_export_policy(req.export_policy()); + hints.need_view_data_hash = req.has_need_view_data_hash() ? req.need_view_data_hash() : true; if (has_artifact) hints.artifact_id = resolved_artifact_id; if (disk_metadata.has_value()) { diff --git a/proto/tensorcast/daemon/v2/store_daemon.proto b/proto/tensorcast/daemon/v2/store_daemon.proto index 1520f5e16..2d4fa189d 100644 --- a/proto/tensorcast/daemon/v2/store_daemon.proto +++ b/proto/tensorcast/daemon/v2/store_daemon.proto @@ -401,6 +401,9 @@ message MaterializeReplicaRequest { // location in the local cluster before retrying disk-only. uint32 wait_for_shared_disk_ms = 16; ExportPolicy export_policy = 17; + // When false, the daemon skips computing the realized-view data hash and + // leaves response.view_data_hash unset. Unset preserves legacy behavior. 
+ optional bool need_view_data_hash = 18; } // MaterializeReplica now returns immediately after memory allocation succeeds. diff --git a/tensorcast/api/_config.py b/tensorcast/api/_config.py index 55344cea4..55ef544f5 100644 --- a/tensorcast/api/_config.py +++ b/tensorcast/api/_config.py @@ -634,6 +634,7 @@ class GetArtifactOptions(BaseModel): prefer: str = "auto" # "auto" | "local" | "p2p" | "disk" export_policy: str = "never" # "never" | "auto" | "force" + need_view_data_hash: bool = True pinned_allocation_timeout_ms: int = DEFAULT_PINNED_TIMEOUT_MS # When >0 and the initial retrieval fails, the daemon can wait for a managed # shared-disk location to become ready before retrying disk-only. diff --git a/tensorcast/api/_materialize.py b/tensorcast/api/_materialize.py index 859d28666..7a12844b1 100644 --- a/tensorcast/api/_materialize.py +++ b/tensorcast/api/_materialize.py @@ -5,6 +5,7 @@ import array import contextlib import fcntl +import logging import os import socket import struct @@ -37,6 +38,8 @@ from tensorcast.proto.daemon.v2 import store_daemon_pb2 from tensorcast.types import ServerConfig +logger = logging.getLogger(__name__) + @dataclass(frozen=True) class TensorPayloadDescriptor: @@ -365,6 +368,7 @@ def materialize_artifact_v2( preference=preference_value, source_policy=source_policy, export_policy=export_policy, + need_view_data_hash=bool(opts.need_view_data_hash), target_device_type=target_device_type, lease_mode=lease_mode, timeout_s=effective_timeout_s, diff --git a/tensorcast/api/store/artifact.py b/tensorcast/api/store/artifact.py index 0210f9d2e..df4764d0b 100644 --- a/tensorcast/api/store/artifact.py +++ b/tensorcast/api/store/artifact.py @@ -1457,8 +1457,6 @@ async def tensor_async( view_hash = None if self._view_metadata is not None: view_hash = self._view_metadata.view_data_hash - elif self._view_spec is not None: - view_hash = ViewSpecComposer.hash_view_spec(self._view_spec) batcher = getattr(store, "_batcher", None) if batcher is None: 
return await loop.run_in_executor( @@ -2268,10 +2266,6 @@ def _update_metadata_from_payload( view_index = canonical_index_from_bytes(view_index_bytes) view_hash = getattr(payload, "view_data_hash", None) subset_names = tuple(entry.name for entry in view_index.entries) - if view_hash is None: - view_hash = ViewSpecComposer.hash_view_spec( - self._view_spec, subset=subset_names - ) resolved_view_id: str | None = None if self._view_spec is not None and not self._view_spec.is_identity: view_proto = self._view_spec.proto @@ -2285,7 +2279,7 @@ def _update_metadata_from_payload( view_cache = ViewMetadataCache( view_id=str(resolved_view_id or ""), view_index_bytes=view_index_bytes, - view_data_hash=str(view_hash), + view_data_hash=(str(view_hash) if view_hash else None), tensor_names=subset_names, nbytes=sum(entry.size_bytes for entry in view_index.entries), selected_index=view_index, diff --git a/tensorcast/api/store/view_composer.py b/tensorcast/api/store/view_composer.py index 07179b94c..a799337e4 100644 --- a/tensorcast/api/store/view_composer.py +++ b/tensorcast/api/store/view_composer.py @@ -37,7 +37,7 @@ class ViewMetadataCache: view_id: str view_index_bytes: bytes - view_data_hash: str + view_data_hash: str | None tensor_names: tuple[str, ...] 
nbytes: int selected_index: CanonicalIndex @@ -321,7 +321,6 @@ def compose( ) view_index_bytes = bytes(view_payload["view_index_bytes"]) resolved_selected_index = canonical_index_from_bytes(view_index_bytes) - view_hash = self.hash_view_spec(composed_spec, subset=tensor_names) view_id: str | None = None if composed_spec is not None and not composed_spec.is_identity: view_proto = composed_spec.proto @@ -335,7 +334,7 @@ def compose( view_cache = ViewMetadataCache( view_id=view_id or "", view_index_bytes=view_index_bytes, - view_data_hash=view_hash, + view_data_hash=None, tensor_names=tensor_names, nbytes=int(resolved_selected_index.total_size_bytes), selected_index=resolved_selected_index, diff --git a/tensorcast/daemon_ctl.py b/tensorcast/daemon_ctl.py index 822fb9056..fe61896b5 100644 --- a/tensorcast/daemon_ctl.py +++ b/tensorcast/daemon_ctl.py @@ -1230,6 +1230,7 @@ def materialize_by_artifact_id_v2( preference: store_daemon_pb2.SourcePreference | None = None, source_policy: store_daemon_pb2.SourcePolicy | None = None, export_policy: store_daemon_pb2.ExportPolicy | None = None, + need_view_data_hash: bool = True, target_device_type: store_daemon_pb2.DeviceType = store_daemon_pb2.DeviceType.DEVICE_TYPE_GPU, lease_mode: store_daemon_pb2.LeaseMode = store_daemon_pb2.LeaseMode.LEASE_MODE_UNSPECIFIED, timeout_s: float | int | None = None, @@ -1250,6 +1251,7 @@ def materialize_by_artifact_id_v2( preference: store_daemon_pb2.SourcePreference | None = None, source_policy: store_daemon_pb2.SourcePolicy | None = None, export_policy: store_daemon_pb2.ExportPolicy | None = None, + need_view_data_hash: bool = True, target_device_type: store_daemon_pb2.DeviceType = store_daemon_pb2.DeviceType.DEVICE_TYPE_GPU, lease_mode: store_daemon_pb2.LeaseMode = store_daemon_pb2.LeaseMode.LEASE_MODE_UNSPECIFIED, timeout_s: float | int | None = None, @@ -1270,6 +1272,7 @@ def materialize_by_artifact_id_v2( preference: store_daemon_pb2.SourcePreference | None = None, source_policy: 
store_daemon_pb2.SourcePolicy | None = None, export_policy: store_daemon_pb2.ExportPolicy | None = None, + need_view_data_hash: bool = True, target_device_type: store_daemon_pb2.DeviceType = store_daemon_pb2.DeviceType.DEVICE_TYPE_GPU, lease_mode: store_daemon_pb2.LeaseMode = store_daemon_pb2.LeaseMode.LEASE_MODE_UNSPECIFIED, timeout_s: float | int | None = None, @@ -1288,6 +1291,7 @@ def materialize_by_artifact_id_v2( preference: store_daemon_pb2.SourcePreference | None = None, source_policy: store_daemon_pb2.SourcePolicy | None = None, export_policy: store_daemon_pb2.ExportPolicy | None = None, + need_view_data_hash: bool = True, target_device_type: store_daemon_pb2.DeviceType = store_daemon_pb2.DeviceType.DEVICE_TYPE_GPU, lease_mode: store_daemon_pb2.LeaseMode = store_daemon_pb2.LeaseMode.LEASE_MODE_UNSPECIFIED, timeout_s: float | int | None = None, @@ -1341,6 +1345,8 @@ def materialize_by_artifact_id_v2( request.source_policy.CopyFrom(source_policy) if export_policy is not None: request.export_policy = export_policy + if not need_view_data_hash: + request.need_view_data_hash = False if placement is not None: request.placement = placement try: From c34804f1861c915c69df7f06fe6479ab1806d73f Mon Sep 17 00:00:00 2001 From: i-zhouyuhan Date: Fri, 13 Mar 2026 15:58:07 +0800 Subject: [PATCH 06/10] fix(daemon): 1) use pwrite for memfd-backed CPU export; 2) hint MADV_HUGEPAGE for CPU export --- .../store/replica/unified_memory_authority.cc | 165 ++++++++++++++++-- 1 file changed, 155 insertions(+), 10 deletions(-) diff --git a/core/store/replica/unified_memory_authority.cc b/core/store/replica/unified_memory_authority.cc index 2d3f7d804..c07143e38 100644 --- a/core/store/replica/unified_memory_authority.cc +++ b/core/store/replica/unified_memory_authority.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,8 +17,11 @@ #include #include #include +#include +#include #include "absl/log/log.h" +#include "absl/log/vlog_is_on.h" #include 
"absl/strings/str_format.h" #include "core/common/const/granularity.h" #include "core/common/system_capabilities.h" @@ -32,6 +36,69 @@ namespace tensorcast::store::replica { using tensorcast::common::SystemCapabilities; +namespace { + +struct ThreadFaultSnapshot { + long minor_faults{0}; + long major_faults{0}; +}; + +struct ResidencySnapshot { + size_t resident_pages{0}; + size_t total_pages{0}; +}; + +std::optional capture_thread_fault_snapshot() { + struct rusage usage{}; + if (::getrusage(RUSAGE_THREAD, &usage) != 0) { + return std::nullopt; + } + return ThreadFaultSnapshot{.minor_faults = usage.ru_minflt, .major_faults = usage.ru_majflt}; +} + +std::optional capture_residency_snapshot(void* addr, size_t len) { + if (addr == nullptr || len == 0) { + return ResidencySnapshot{}; + } + const long page_size = ::sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + return std::nullopt; + } + const size_t page_count = (len + static_cast(page_size) - 1) / static_cast(page_size); + std::vector pages(page_count, 0); + if (::mincore(addr, len, pages.data()) != 0) { + return std::nullopt; + } + size_t resident_pages = 0; + for (unsigned char page : pages) { + resident_pages += (page & 1U) != 0U ? 1U : 0U; + } + return ResidencySnapshot{.resident_pages = resident_pages, .total_pages = page_count}; +} + +void maybe_hint_hugepage(void* addr, size_t bytes) { +#ifdef MADV_HUGEPAGE + if (addr == nullptr || bytes == 0) { + return; + } + const auto hugepage_start = std::chrono::steady_clock::now(); + const int hugepage_rc = ::madvise(addr, bytes, MADV_HUGEPAGE); + const int hugepage_errno = hugepage_rc == 0 ? 
0 : errno; + const auto hugepage_end = std::chrono::steady_clock::now(); + if (hugepage_rc != 0 && hugepage_errno != EINVAL && hugepage_errno != ENOSYS) { + PLOG(WARNING) << "madvise MADV_HUGEPAGE failed in UMA allocate_region"; + } + VLOG(1) << "uma.cpu_arena.hugepage_hint bytes=" << bytes << " rc=" << hugepage_rc << " errno=" << hugepage_errno + << " duration_us=" + << std::chrono::duration_cast(hugepage_end - hugepage_start).count(); +#else + (void)addr; + (void)bytes; +#endif +} + +} // namespace + UnifiedMemoryAuthority::UnifiedMemoryAuthority(size_t artifact_chunk_bytes) : UnifiedMemoryAuthority(artifact_chunk_bytes, Options{}) {} @@ -1286,6 +1353,7 @@ absl::Status UnifiedMemoryAuthority::write_cpu_span( uint64_t va_offset, const void* src, size_t bytes) { + const auto total_start = std::chrono::steady_clock::now(); if (bytes == 0) { return absl::OkStatus(); } @@ -1297,11 +1365,19 @@ absl::Status UnifiedMemoryAuthority::write_cpu_span( if (it == allocations_.end()) { return absl::NotFoundError(absl::StrFormat("Replica %s not found in unified memory", key.artifact_id)); } + const auto arena_start = std::chrono::steady_clock::now(); auto status = cpu_arena_.write_span(it->second, va_offset, src, bytes); + const auto arena_end = std::chrono::steady_clock::now(); if (!status.ok()) { return status; } + const auto record_start = std::chrono::steady_clock::now(); record_cpu_write_locked_(it->second, va_offset, bytes); + const auto record_end = std::chrono::steady_clock::now(); + VLOG(2) << "uma.write_cpu_span artifact_id=" << key.artifact_id << " offset=" << va_offset << " bytes=" << bytes + << " arena_us=" << std::chrono::duration_cast(arena_end - arena_start).count() + << " record_us=" << std::chrono::duration_cast(record_end - record_start).count() + << " total_us=" << std::chrono::duration_cast(record_end - total_start).count(); return absl::OkStatus(); } @@ -1345,6 +1421,7 @@ absl::Status UnifiedMemoryAuthority::CpuArena::allocate_region(ReplicaAllocation 
alloc.cpu_region.bytes = bytes; alloc.cpu_region.memfd = fd; alloc.cpu_region.offset_bytes = 0; + maybe_hint_hugepage(addr, bytes); return absl::OkStatus(); } @@ -1356,6 +1433,7 @@ absl::Status UnifiedMemoryAuthority::CpuArena::allocate_region(ReplicaAllocation alloc.cpu_region.bytes = bytes; alloc.cpu_region.memfd = -1; alloc.cpu_region.offset_bytes = 0; + maybe_hint_hugepage(addr, bytes); return absl::OkStatus(); } @@ -1395,12 +1473,16 @@ absl::Status UnifiedMemoryAuthority::CpuArena::write_span( uint64_t va_offset, const void* src, size_t bytes) const { + const auto total_start = std::chrono::steady_clock::now(); + const bool memfd_backed = alloc.cpu_region.memfd >= 0; if (bytes == 0) { return absl::OkStatus(); } + const auto bounds_start = std::chrono::steady_clock::now(); if (auto st = ensure_bounds(alloc, va_offset, bytes); !st.ok()) { return st; } + const auto bounds_end = std::chrono::steady_clock::now(); const long page = sysconf(_SC_PAGESIZE); uint64_t page_aligned_off = (va_offset / page) * page; @@ -1408,26 +1490,89 @@ absl::Status UnifiedMemoryAuthority::CpuArena::write_span( uint64_t page_aligned_end = ((end_off + page - 1) / page) * page; size_t aligned_len = static_cast(page_aligned_end - page_aligned_off); void* aligned_addr = static_cast(alloc.cpu_region.base) + page_aligned_off; - if (::mprotect(aligned_addr, aligned_len, PROT_READ | PROT_WRITE) != 0) { - if (alloc.cpu_region.memfd >= 0) { - return absl::ErrnoToStatus(errno, "UMA write: mprotect failed for memfd-backed CPU arena"); - } - void* mapped = - ::mmap(aligned_addr, aligned_len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); - if (mapped == MAP_FAILED || mapped != aligned_addr) { - return absl::ErrnoToStatus(errno, "UMA write: failed to ensure writable mapping"); + const bool collect_vlog2 = ABSL_VLOG_IS_ON(2); + const auto fault_before = collect_vlog2 ? 
capture_thread_fault_snapshot() : std::nullopt; + const auto resident_before = + collect_vlog2 && memfd_backed ? capture_residency_snapshot(aligned_addr, aligned_len) : std::nullopt; + const auto protect_start = std::chrono::steady_clock::now(); + if (!memfd_backed) { + if (::mprotect(aligned_addr, aligned_len, PROT_READ | PROT_WRITE) != 0) { + void* mapped = + ::mmap(aligned_addr, aligned_len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (mapped == MAP_FAILED || mapped != aligned_addr) { + return absl::ErrnoToStatus(errno, "UMA write: failed to ensure writable mapping"); + } } } + const auto protect_end = std::chrono::steady_clock::now(); - if (SystemCapabilities::instance().madv_willneed_available()) { + const auto madvise_start = std::chrono::steady_clock::now(); + const bool need_madvise = !memfd_backed; + if (need_madvise && SystemCapabilities::instance().madv_willneed_available()) { int rc = ::madvise(aligned_addr, aligned_len, MADV_WILLNEED); if (rc != 0 && errno != EINVAL) { PLOG(WARNING) << "madvise WILLNEED failed in UMA write"; } } + const auto madvise_end = std::chrono::steady_clock::now(); void* dst = static_cast(alloc.cpu_region.base) + va_offset; - std::memcpy(dst, src, bytes); + const auto copy_start = std::chrono::steady_clock::now(); + if (memfd_backed) { + const char* src_bytes = static_cast(src); + size_t remaining = bytes; + uint64_t written = 0; + while (remaining > 0) { + const ssize_t rc = + ::pwrite(alloc.cpu_region.memfd, src_bytes + written, remaining, static_cast(va_offset + written)); + if (rc < 0) { + if (errno == EINTR) { + continue; + } + return absl::ErrnoToStatus(errno, "UMA write: pwrite to memfd-backed CPU arena failed"); + } + if (rc == 0) { + return absl::InternalError("UMA write: pwrite returned 0 for memfd-backed CPU arena"); + } + written += static_cast(rc); + remaining -= static_cast(rc); + } + } else { + std::memcpy(dst, src, bytes); + } + const auto copy_end = 
std::chrono::steady_clock::now(); + const auto fault_after = collect_vlog2 ? capture_thread_fault_snapshot() : std::nullopt; + const auto resident_after = + collect_vlog2 && memfd_backed ? capture_residency_snapshot(aligned_addr, aligned_len) : std::nullopt; + const auto copy_us = std::chrono::duration_cast(copy_end - copy_start).count(); + const double copy_gib_per_s = copy_us > 0 + ? (static_cast(bytes) / (1024.0 * 1024.0 * 1024.0)) / (static_cast(copy_us) / 1e6) + : 0.0; + VLOG(2) << "uma.cpu_arena.write_span offset=" << va_offset << " bytes=" << bytes << " aligned_len=" << aligned_len + << " memfd_backed=" << memfd_backed << " copy_method=" << (memfd_backed ? "pwrite" : "memcpy") + << " ensure_bounds_us=" + << std::chrono::duration_cast(bounds_end - bounds_start).count() << " mprotect_us=" + << std::chrono::duration_cast(protect_end - protect_start).count() + << " madvise_us=" + << std::chrono::duration_cast(madvise_end - madvise_start).count() + << " copy_us=" << copy_us << " copy_gib_per_s=" << absl::StrFormat("%.3f", copy_gib_per_s) + << " fault_minor_delta=" + << ((fault_before.has_value() && fault_after.has_value()) + ? (fault_after->minor_faults - fault_before->minor_faults) + : -1) + << " fault_major_delta=" + << ((fault_before.has_value() && fault_after.has_value()) + ? (fault_after->major_faults - fault_before->major_faults) + : -1) + << " resident_pages_before=" + << (resident_before.has_value() ? static_cast(resident_before->resident_pages) : -1) + << " resident_pages_after=" + << (resident_after.has_value() ? static_cast(resident_after->resident_pages) : -1) + << " total_pages=" + << (resident_after.has_value() + ? static_cast(resident_after->total_pages) + : (resident_before.has_value() ? 
static_cast(resident_before->total_pages) : -1)) + << " total_us=" << std::chrono::duration_cast(copy_end - total_start).count(); return absl::OkStatus(); } From 8945db6de6f57852aac32a4e142480e931633bbf Mon Sep 17 00:00:00 2001 From: i-zhouyuhan Date: Thu, 19 Mar 2026 13:47:40 +0800 Subject: [PATCH 07/10] docs(daemon): add source-side-remote-view-transport design --- docs/architecture/p2p-transfer-strategies.md | 14 + .../0086-source-side-remote-view-transport.md | 936 ++++++++++++++++++ 2 files changed, 950 insertions(+) create mode 100644 docs/designs/0086-source-side-remote-view-transport.md diff --git a/docs/architecture/p2p-transfer-strategies.md b/docs/architecture/p2p-transfer-strategies.md index 2c8e2f6c0..bfd56a28c 100644 --- a/docs/architecture/p2p-transfer-strategies.md +++ b/docs/architecture/p2p-transfer-strategies.md @@ -11,6 +11,7 @@ This document explains how P2P transfers work in Global Store mode. It is code-d Related docs: - `docs/architecture/artifact-views-and-retrieval.md` - `docs/architecture/view-replicas-and-assembly.md` +- `docs/designs/0086-source-side-remote-view-transport.md` ## Scope and terminology @@ -212,6 +213,19 @@ Notes: - `GlobalStoreRegistrationPublisher` calls `GlobalStoreClient::update_artifact_view_state` to persist view metadata; failures are logged and treated as best-effort when the server does not support the RPC. - For view materialization via P2P, `MetadataGateway::register_replica` also calls `record_view_residency` (currently `Unimplemented` in the client), so view residency updates are best-effort until the server RPC lands. +## Proposed: Source-side Remote View Transport + +The forward-looking design for remote TP-slice transport now lives in `docs/designs/0086-source-side-remote-view-transport.md`. 
+ +That design covers: + +- first-class view residency in Global Store; +- lookup-or-derive routing for `request_view_transport(view)`; +- source-side derived-view export from canonical local replicas; +- compatibility, verification, and observability requirements for phased rollout. + +This architecture document remains focused on the current code path and its existing transport behavior. + ## Observability - Global Store metrics: `inc_transport_request`, `observe_transport_wait`, `inc_active_transports`, `dec_active_transports`. diff --git a/docs/designs/0086-source-side-remote-view-transport.md b/docs/designs/0086-source-side-remote-view-transport.md new file mode 100644 index 000000000..d1f9d3e84 --- /dev/null +++ b/docs/designs/0086-source-side-remote-view-transport.md @@ -0,0 +1,936 @@ +--- +slug: source-side-remote-view-transport +title: Source-Side Remote View Transport +areas: ["global_store", "core", "daemon", "proto", "tests", "docs"] +status: proposed +created: 2026-03-15 +last_updated: 2026-03-18 +related_code: + - docs/architecture/p2p-transfer-strategies.md + - docs/plans/0086-source-side-remote-view-transport.md + - proto/tensorcast/global_store/v1/global_store.proto + - tensorcast/global_store/rpc/transport_rpc_handler.py + - tensorcast/global_store/services/transport_service.py + - tensorcast/global_store/repositories/replica_repository.py + - core/store/components/global_store_client.h + - core/store/components/global_store_client.cc + - core/store/materialization/control/materialize_orchestrator.cc + - core/store/runtime/metadata/metadata_gateway.cc +links: + plan: ../plans/0086-source-side-remote-view-transport.md + related: + - ../architecture/p2p-transfer-strategies.md + - ../architecture/artifact-views-and-retrieval.md + - ../architecture/view-replicas-and-assembly.md +--- + +# Summary + +Today `request_view_transport` only routes to an already-known remote `view_id` source. 
When the Global Store cannot route that `view_id`, the requester falls back to canonical transport and reconstructs the requested view locally. + +That preserves correctness, but it is the wrong performance shape for TP-sliced model loading: + +- the destination daemon reads canonical byte-space instead of dense view bytes; +- the destination daemon pays read amplification plus local strided repack/pack cost; +- the same reconstruction work is repeated on every destination daemon even when the source daemon already has the canonical replica in local DRAM/VRAM. + +This design proposes a source-side remote view transport model whose end state is: + +- daemon B asks for `artifact_id + view_id`; +- daemon A prepares the requested view byte-space locally from its canonical replica; +- daemon B receives already-reconstructed dense view bytes from daemon A and does not fall back to canonical transport; +- source-side derived views behave as daemon-owned ephemeral cache entries: reusable across repeated fetches for a bounded TTL, but pressure-evictable and never durable by default. + +# Goals + +- Preserve existing TensorCast view semantics: `view_id` remains the stable identity of a byte-space derived from a canonical artifact plus view spec. +- Make view byte-space a first-class transport target, not only a first-class local materialization result. +- Eliminate destination-side canonical reconstruction when a source daemon can derive the requested view locally. +- Ensure verification is performed against view byte-space metadata when available, and never by incorrectly reusing canonical verification metadata for a view. +- Preserve mixed-version compatibility by keeping canonical fallback available until all required control-plane pieces are implemented. + +# Non-Goals + +- Changing how `view_id` is computed. +- Publishing every derived view as a durable artifact by default. +- Routing artifact payload bytes through the Global Store. 
+- Removing canonical fallback before view-aware routing is fully implemented and rolled out. + +# Current Behavior and Gap + +The current pipeline already carries enough view identity to express the desired route: + +- `MaterializeOrchestrator` prefers `request_view_transport` when `view_id` is present. +- `MetadataGateway` attempts to record view residency after successful ingestion. +- registration flows already compute and publish `view_data_hash` and partial canonical coverage for registered views. + +However, the control-plane implementation is incomplete: + +- `record_view_residency` is still `Unimplemented` in the client; +- `request_view_transport` only works when the Global Store can already route an existing view source; +- when that lookup misses, the requester falls back to canonical routing and performs view reconstruction locally. + +The design closes this gap in two phases. + +# Phase 1: First-Class Routable View Residency + +Phase 1 makes existing view replicas routable. + +## Control-Plane Changes + +- Implement the Global Store view residency / view state RPCs so daemon-published view metadata becomes queryable instead of best-effort. +- Persist enough metadata to route a view source: + - `artifact_id` + - `view_id` + - `view_size_bytes` + - `view_data_hash` when available + - source worker / node identity + - transport-addressable residency information and device placement +- Keep view metadata immutable with the same conflict rules already used by view registration. `view_id` identity and `view_data_hash` must stay stable. + +## Routing Behavior + +- `request_view_transport(view)` first looks for an already-resident view replica. +- If found, the response routes directly to that view source. +- If not found, the current canonical fallback behavior is preserved. 
+ +## Why Phase 1 Matters + +This phase does not yet create view sources on demand, but it makes the existing view byte-space model routable and removes redundant fallback when a matching view was already materialized or registered elsewhere. + +# Phase 2: Source-Side Derived View Transport + +Phase 2 extends `request_view_transport(view)` from a pure lookup into a lookup-or-derive operation. + +## Target Semantics + +When daemon B requests `artifact_id + view_id`: + +1. If the Global Store already knows a resident view source, route to it. +2. Otherwise, choose a canonical source daemon A that: + - has a routable canonical replica; + - can derive the requested `view_id` from local canonical bytes; + - is eligible under the usual load, heartbeat, and transport guardrails. +3. Ask daemon A to expose the requested view byte-space as the transport source. +4. Return a transport session whose payload is the dense view byte-space, not the canonical byte-space. + +The key semantic point is that the selected source still serves the requested byte-space identified by `view_id`; it is not a canonical transport disguised as a view request. + +## Source Daemon Behavior + +When daemon A is selected as a derivation source: + +- it resolves the requested `view_id` against canonical metadata and reconstructs the view locally from its canonical replica; +- the reconstruction should reuse the existing view dataplane primitives such as `ViewPlanner`, `ViewPlanSource`, and `ByteRangeMappedSource` instead of introducing a second view implementation just for transport; +- the derived source is published as a daemon-owned resident-view cache entry, not as a durable artifact and not as a transport-scoped one-shot object. 
+ +The required semantics for this derived source are: + +- it is ephemeral, but not immediate-release; +- it is keyed by `(artifact_id, view_id, device)` and lives on daemon A; +- it is reusable across repeated fetches for the same byte-space; +- it is retained under a bounded TTL that refreshes on real use; +- it is pressure-evictable and must never compete with canonical pinned replicas as an unbounded peer; +- it is retired by daemon A, not by daemon B. + +This keeps the feature aligned with TensorCast semantics: a view can exist as a stable byte-space identity without requiring every transport-time derivation to become durable global state, while still allowing later consumers to reuse the first prepared dense view. + +## Destination Daemon Behavior + +When daemon B receives a routed view transport: + +- it loads dense view bytes from the source transport directly; +- it does not reconstruct the TP slice from canonical byte-space locally; +- it materializes the resulting local replica and exports it to the requester exactly as today. + +When daemon B repeats the same request while daemon A still holds a live derived-view cache entry: + +- daemon B should receive a resident-view route immediately from the Global Store; +- daemon A should reuse the existing derived view instead of reconstructing it again; +- the data-plane use should refresh daemon A's TTL for that cache entry. + +In other words, destination-side `canonical -> view` remapping is replaced with source-side `canonical -> view` derivation. + +## Derived View Export Lifecycle Semantics + +The runtime behavior above requires an explicit lifecycle model. The important point is that source-side derived view exports are neither: + +- durable published replicas; nor +- single-use transport scratch objects that disappear as soon as one consumer finishes. + +They are daemon-owned ephemeral resident-view cache entries. 
+ +### Ownership + +The lifecycle owner is daemon A, not daemon B and not the Global Store. + +- daemon B may trigger prepare and consume the resulting transport; +- the Global Store may route or stop routing the resident view; +- daemon A owns the local replica memory, the export keys, the route publication state, TTL, and eviction decisions. + +This separation is required because only daemon A has authoritative visibility into: + +- whether the local derived view still exists; +- whether it is currently serving in-flight transports; +- how much local stable DRAM budget remains; +- whether an entry should stay hot, become idle, or be retired. + +### Logical State Model + +Each source-side derived view export should have a daemon-local record keyed by `(artifact_id, view_id, device)`. + +The minimum state is: + +- `state`: `preparing | live | draining | retired` +- `replica_uuid` / local `ReplicaKey` +- resident-view route identity as advertised to the Global Store +- `size_bytes` +- `expiry` +- `last_access` +- `active_fetches` +- export registration metadata needed for orderly unregister / release + +The intended state machine is: + +1. `preparing` + - daemon A is materializing the dense view locally + - not yet reusable + - not evictable +2. `live` + - resident-view route is published + - `active_fetches > 0` means transport-pinned + - `active_fetches == 0` means idle-but-reusable +3. `draining` + - route has been withdrawn + - no new fetches may attach + - daemon waits for `active_fetches` to drop to zero +4. `retired` + - export unregistered + - route withdrawn + - local replica memory released + +### TTL Retention + +Derived view exports should use sliding TTL retention. 
+ +- on first successful prepare, daemon A assigns `expiry = now + ttl`; +- when a later fetch actually uses the derived view data-plane, daemon A refreshes `expiry`; +- repeated control-plane probes must not refresh TTL; +- failed or abandoned prepare attempts must not create a long-lived TTL entry. + +The refresh point should correspond to real source-side use, not just route visibility: + +- preferred: refresh on the first actual read / transport use from the export; +- acceptable fallback: refresh when daemon A commits to serving the resident-view transport; +- not acceptable: refresh on `RequestReplicaTransport(view)` polling alone. + +This is what preserves the useful reuse behavior seen in relay benchmarks: + +- `trial 1` pays the prepare cost; +- `trial 2` and `trial 3` can reuse the same derived view if they arrive within TTL; +- unused entries still age out naturally. + +### Pressure Eviction + +TTL alone is not sufficient. Update-style workloads generate different `artifact_id` values, so old derived views must not remain resident until TTL expiry if they block admission of newer ones. + +Therefore source-side derived views must also be pressure-evictable: + +- expired idle entries are evicted first; +- then non-expired idle entries in oldest-`last_access` order; +- `preparing` entries are never evicted; +- `live` entries with `active_fetches > 0` are never evicted. + +Admission of a new derived view should follow this order: + +1. reuse an existing live entry for the exact `(artifact_id, view_id, device)` if present; +2. if not present, try to admit a new entry; +3. if admission would exceed the source daemon's cache budget, evict eligible idle derived views; +4. if admission still cannot succeed, stop upgrading and fall back to canonical transport rather than overcommitting daemon A. 
+ +This is a semantic requirement, not only a performance optimization: + +- canonical model versions may legitimately be pinned in stable DRAM; +- source-side derived views must be lower-priority cache entries; +- otherwise repeated multi-version updates can exhaust daemon A memory and break later fetches. + +The design permits two equivalent budgeting implementations: + +- a dedicated `derived_view_export_cache_bytes` budget; or +- a lower-priority class inside the existing stable tier, provided eviction can target derived views before canonical pinned replicas. + +## Phase 7 Hardening Notes + +The first production failure mode after Phase 6 was not a byte-space correctness bug. It was a lifecycle/admission bug: `prepare` admission added an extra throttle on top of the real derived-view budget reservation and could serialize TP-slice prepares even when the derived-view cache still had enough budget for all of them. In practice that caused one TP rank to miss the upgrade wait budget and fall back to canonical transport on the destination daemon. + +The hardening rule is therefore: + +- prepare-time throttling must never be stricter than the actual derived-view admission budget; +- if admission truly cannot proceed after eviction, the source-side upgrade should fail explicitly and the requester should fall back to canonical transport with an observable reason; +- canonical fallback is a compatibility and pressure backstop, not a side effect of an arbitrary local serialization gate. + +### Source-Side Fetch Lifecycle + +For ephemeral derived views, source-daemon lifecycle correctness depends on tracking actual source-side data-plane use, not only route publication. + +The intended data-plane contract is: + +1. `LockTransportChunks(view)` is the first source-daemon operation that proves the routed view is actually being used. +2. Source daemon A increments `active_fetches` for the `(artifact_id, view_id, device)` entry at lock time. +3. 
TTL refresh happens at the same point, because this is a real data-plane use rather than a control-plane probe. +4. `UnlockTransportChunks(view)` decrements `active_fetches`. +5. If a lock token expires without unlock, the lock TTL sweep must also decrement `active_fetches`. + +This gives the manager a local notion of "in flight on daemon A" that is narrower and more precise than Global Store transport-session bookkeeping. + +Current implementation note: + +- the relay path exercised by `load_weight_remote` and `update_weight_remote` performs direct MTCP/RDMA reads from exported memory keys and does not route those reads through `LockTransportChunks`; +- therefore `LockTransportChunks`-based accounting is not sufficient to implement Phase 7 `active_fetches` semantics for remote relay by itself; +- Phase 7 uses explicit daemon-to-daemon RPCs on the relay hot path: + - daemon B sends `BeginReplicaFetch(transport_id, artifact_id, view_id, device)` to daemon A immediately before `ingest_from_p2p`; + - daemon A resolves the source-side derived-view entry keyed by `(artifact_id, view_id, device)`, refreshes TTL, increments `active_fetches`, and records the `transport_id` idempotently; + - daemon B sends `EndReplicaFetch(transport_id, reason)` immediately after `ingest_from_p2p` returns, regardless of success or failure; + - daemon A decrements `active_fetches` by `transport_id`, making drain/retire wait on real data-plane lifetime rather than on control-plane route visibility. + +### Ordered Retirement + +Retirement of a source-side derived view is a two-stage drain: + +1. mark the resident-view route unavailable in the Global Store and wait for remote transport-session drain; +2. close local attaches on daemon A, wait for `active_fetches == 0`, then unregister the source route and release local backing memory. 
+ +This ordering is intentional: + +- route withdrawal happens before local release, so new route selection cannot observe a dead endpoint; +- already-issued transports may still finish during the remote-drain window; +- after remote drain is complete, daemon A stops accepting new attaches for that entry and waits for local `lock/unlock` users to leave; +- only then does daemon A unregister the replica and free the local dense-view backing. + +If retirement fails after route withdrawal has already happened, the entry must stay daemon-owned and draining, then retry retirement later. It must not be revived as a normal reusable entry, because the route is already withdrawn. + +### Cleanup / Retirement Ordering + +Retirement must be source-daemon-owned and ordered. + +Correct order (steps 1-3 here; steps 4-6 of this same list appear after the "Observability Requirements" subsection of Phase 8 below): + +1. mark the entry `draining`; +2. withdraw the resident-view route from the Global Store; +3. wait for in-flight transports to drain (`active_fetches == 0`); + +# Phase 8: Deadline and Wait-Budget Semantics for Source-Side Upgrade + +The relay/update regressions observed after Phase 7 were not only lifecycle issues. They also exposed a second, orthogonal problem: the destination daemon and source daemon do not currently use a principled waiting budget for `derived_view_from_canonical -> resident_view` upgrade. + +In the failing `update_weight_remote` runs: + +- the caller gave TensorCast an end-to-end materialization deadline of about 180s; +- the destination daemon still derived its internal materialization / transport wait budget from `pinned_allocation_timeout_ms`, which defaults to 30s; +- daemon B therefore waited only about 30s for daemon A to finish preparing the source-side dense view; +- when that 30s budget expired, daemon B fell back to canonical transport; +- canonical transport then consumed the rest of the caller deadline and the overall request failed anyway. + +This is the wrong semantic layering. `pinned_allocation_timeout_ms` is a local resource-wait knob.
It should not decide how long daemon B is willing to wait for daemon A to prepare a routed remote view. + +## Design Goal + +Use the caller deadline as the authoritative budget for the whole materialization request, and derive source-side upgrade waiting from that budget instead of from unrelated pinned-allocation settings. + +The intended hierarchy is: + +1. caller deadline: + - authoritative end-to-end SLA for one materialization request; + - comes from the client call context / gRPC deadline; +2. source-side upgrade wait budget: + - bounded by the remaining caller deadline; + - used when daemon B asks daemon A to prepare and publish a source-side dense view; +3. pinned-allocation timeout: + - local timeout for pinned buffer / staging allocation only; + - must not be reused as the source-side upgrade deadline. + +## Why the Current Behavior Is Wrong + +The current implementation couples unrelated concerns: + +- `req.pinned_allocation_timeout_ms` is used to derive `request_budget`; +- `request_budget` is then copied into `transport_wait_timeout`; +- `transport_wait_timeout` is used by the orchestrator as the wait budget for source-side export readiness. + +This means a local staging timeout silently becomes a distributed control/data-plane timeout. For large remote updates, that is too strict and not workload-aware. + +The result is a pathological fallback pattern: + +- the system waits too little on the fast path; +- then falls back to the slower path; +- then still fails under the original caller deadline. + +That fallback does not improve correctness or latency. It only obscures the real cause. + +## Proposed Semantics + +### 1. Authoritative Request Budget + +For daemon-side materialization: + +- if the incoming gRPC request has a deadline, that remaining deadline is the authoritative `request_budget`; +- if no deadline is provided, the daemon may continue to use a conservative internal default budget for the request as a whole. 
+ +`request_budget` therefore means: + +- "how long this materialization request is allowed to continue overall", + +not: + +- "how long pinned allocation is allowed to wait". + +### 2. Source-Side Upgrade Wait Budget + +When daemon B attempts `derived_view_from_canonical -> resident_view` upgrade: + +- the wait budget for source-side prepare and readiness is derived from the remaining authoritative `request_budget`; +- the upgrade may consume most of the remaining request budget if needed; +- the wait budget shrinks naturally as the caller deadline is consumed by earlier work. + +This is the intended end-to-end behavior: + +- if the caller is willing to wait 180s, daemon B may use that budget for the source-side upgrade path; +- if only 20s remain, daemon B may only use the remaining 20s. + +No extra 30s local cap should be injected unless a future API explicitly asks for one. + +### 3. Pinned Allocation Timeout Stays Local + +`pinned_allocation_timeout_ms` remains meaningful, but only for local resource waits such as: + +- pinned host buffer acquisition; +- local staging / allocation guardrails tied to pinned resources. + +It should no longer be copied into: + +- daemon-side `request_budget`; +- daemon-side `transport_wait_timeout`; +- source-side export readiness wait budget. + +### 4. Fallback Policy by Failure Class + +Canonical fallback remains necessary, but not for every timeout. + +Canonical fallback is still correct when the source-side upgrade cannot safely proceed because of: + +- capability mismatch; +- route lookup unavailability; +- source-side admission failure; +- pressure / eviction failure; +- explicit lifecycle race such as a drained or invalidated source entry. + +Canonical fallback is not the right response when the upgrade simply consumes the available request deadline. 
+ +Therefore: + +- compatibility / admission / resource failures may fall back to canonical transport; +- deadline exhaustion while waiting for source-side export readiness should surface as timeout to the caller, not as a late canonical fallback. + +This preserves the role of canonical transport as a compatibility and resource backstop, while avoiding "fallback to something slower after the fast path already used up the deadline". + +## Minimal-Interface Implementation Strategy + +The first implementation should keep interface changes minimal: + +- no required public API change in SGLang; +- no immediate proto change; +- no new user-visible config required for correctness. + +The internal change is: + +1. stop deriving daemon-side `request_budget` from `pinned_allocation_timeout_ms`; +2. derive `request_budget` from the caller deadline; +3. derive orchestrator `transport_wait_timeout` from that request budget; +4. keep `pinned_allocation_timeout_ms` only on the local allocation path; +5. change source-side upgrade timeout handling so deadline exhaustion propagates instead of triggering late canonical fallback. + +If future workloads need more tuning, an additive API can later introduce a dedicated explicit knob such as: + +- `request_budget_ms`; or +- `source_prepare_wait_timeout_ms`. + +But that is not required for the first fix. The current bug is semantic miswiring, not missing API surface. + +## Observability Requirements + +Budget-related failures must be diagnosable from logs. The orchestrator / daemon should emit enough information to distinguish: + +- caller request budget; +- remaining gRPC deadline at request entry; +- pinned allocation timeout; +- transport wait timeout; +- source-side prepare wait budget; +- fallback reason versus terminal timeout reason. + +Without this, large-model remote update/load regressions are difficult to attribute correctly. +4. (continuing the "Correct order" retirement list from "Cleanup / Retirement Ordering") unregister export keys / transport-visible state; +5. release local replica memory; +6.
delete the daemon-local entry. + +## Relay Control/Data-Plane Sequence + +The remote relay path now has three distinct actors: + +- daemon A: source daemon that owns canonical residency and any ephemeral derived-view export; +- Global Store: route selector and transport-session coordinator; +- daemon B: destination daemon that receives the view transport and materializes locally. + +The steady-state sequence for `request_view_transport(artifact_id, view_id)` is: + +```mermaid +sequenceDiagram + participant B as daemon B + participant GS as Global Store + participant A as daemon A + + B->>GS: request_view_transport(artifact_id, view_id) + GS->>A: find resident view route or derive-on-demand + A-->>GS: prepare / publish resident view + GS-->>B: TransportSession(route=resident_view, transport_id) + + B->>A: BeginReplicaFetch(transport_id, artifact_id, view_id, device) + A-->>B: managed=true + Note over A: refresh TTL\nactive_fetches++\nbind transport_id + + B-)A: ingest_from_p2p / read_tensor (MTCP-RDMA) + + B->>A: EndReplicaFetch(transport_id, reason) + A-->>B: managed=true + Note over A: active_fetches--\nrelease transport_id + + B->>GS: complete_replica_transport(outcome) +``` + +Important properties: + +- TTL refresh is tied to `BeginReplicaFetch`, not to `request_view_transport` lookup. +- `active_fetches` is keyed by `transport_id`, so duplicate begin/end RPCs are idempotent and draining waits only for relay fetches that are actually in flight. +- `complete_replica_transport` remains a Global Store concern, but it is no longer the only signal used for source-side lifecycle safety.
+ +## When Canonical Fallback Is Still Expected + +Canonical fallback remains valid when: + +- the Global Store cannot route a resident view and source-side derive-on-demand admission fails under real budget pressure; +- the source or destination daemon does not support view-aware routing or source-side derive-on-demand; +- the source daemon cannot publish a routable resident view in time for the request budget; +- the relay path is forced onto the canonical route for compatibility rollout or explicit policy. + +It is no longer acceptable to hit canonical fallback merely because the source daemon lacks fetch-lifecycle visibility for a routed resident view. Phase 7 closes that gap with `BeginReplicaFetch` / `EndReplicaFetch`. + +Incorrect order: + +- releasing local export state before route withdrawal; +- dropping the transport endpoint while the Global Store can still route to it; +- relying on daemon B's one-shot `ReleaseReplica(ticket)` call as the normal steady-state cleanup mechanism. + +This ordering exists to prevent exactly the stale-route failure mode where: + +- the Global Store still returns a resident-view source; +- daemon B attempts to connect; +- daemon A has already dropped the underlying transport/export state. + +### Reuse versus Immediate Release + +The intended steady-state path after a successful transfer is: + +- transfer completes; +- daemon A decrements `active_fetches`; +- if it reaches zero, the entry becomes idle and remains reusable until TTL expiry or eviction. + +The intended path is not: + +- transfer completes; +- daemon B immediately destroys the prepared export; +- the next consumer is forced to reconstruct the same dense view again. + +Immediate post-transfer release defeats the purpose of source-side reuse and is incompatible with the observed benchmark behavior where later relay trials intentionally benefit from a previously prepared view. 
+ +### Failure and Abort Semantics + +`ReleaseReplica(ticket)` still has a role, but only as an exceptional cleanup path: + +- to discard an unfinished source-side prepare before a resident-view route becomes live; +- to clean up a failed prepare attempt; +- to handle abort / cancellation before the derived view becomes a reusable resident entry. + +Once daemon A has published a resident-view route, normal lifecycle ownership transfers to daemon A's TTL / eviction machinery. Cleanup must no longer depend on daemon B remembering to call `ReleaseReplica(...)`. + +## Current Runtime Sequence + +The current implementation already has a concrete source-side remote view transport flow. The key point is that daemon B still talks to the Global Store for source selection, but daemon B talks directly to daemon A to force source-side view preparation when the Global Store only has a canonical route. + +The missing piece is lifecycle semantics after that prepare succeeds: + +- today a successful source-side prepare can publish resident-view metadata; +- but a transport-scoped `ReleaseReplica(ticket)` model is not sufficient to express bounded reuse plus safe retirement; +- the desired design below upgrades this into a daemon-owned TTL cache with ordered route withdrawal and pressure eviction. 
+ +### Sequence Diagrams + +#### Resident-View Fast Path + +```mermaid +sequenceDiagram + participant Req as Requester on daemon B + participant B as daemon B + participant GS as Global Store + participant A as daemon A + + Req->>B: materialize(artifact_id, view_id) + B->>GS: RequestReplicaTransport(view) + GS-->>B: resident_view transport session + B->>A: BeginReplicaFetch(transport_id,\nartifact_id, view_id, device) + A-->>B: managed=true + Note over A: refresh TTL\nactive_fetches++ + B-)A: ingest transport payload + Note over A,B: payload is already dense view byte-space + B->>A: EndReplicaFetch(transport_id, reason) + A-->>B: managed=true + Note over A: active_fetches--\nentry returns to idle TTL cache + B->>GS: CompleteReplicaTransport(success) + B-->>Req: local replica ready +``` + +#### Source-Side Upgrade Path + +```mermaid +sequenceDiagram + participant Req as Requester on daemon B + participant B as daemon B + participant GS as Global Store + participant A as daemon A + + Req->>B: materialize(artifact_id, view_id) + B->>GS: RequestReplicaTransport(view) [short probe] + + alt resident view already routable + GS-->>B: resident_view transport session + B->>A: BeginReplicaFetch(transport_id,\nartifact_id, view_id, device) + A-->>B: managed=true + Note over A: refresh TTL\nactive_fetches++ + B-)A: ingest view payload + B->>A: EndReplicaFetch(transport_id, reason) + A-->>B: managed=true + B->>GS: CompleteReplicaTransport(success) + B-->>Req: local replica ready + else resident view not yet routable + GS-->>B: miss / timeout / unimplemented + B->>GS: RequestReplicaTransport(canonical) + GS-->>B: canonical source = daemon A + + Note over B,A: source-side upgrade begins + B->>A: MaterializeReplica(artifact_id, view_id, view_spec,\nwait_for_completion=false, export_policy=FORCE,\nlease_mode=NO_LEASE) + A-->>B: allocated ticket + B->>A: WaitReplicaStatus(ticket) + A-->>B: SUCCESS + + Note over A,GS: daemon A publishes resident view route + A->>GS: register resident 
view metadata + + loop settle / repoll + B->>GS: RequestReplicaTransport(view) + alt resident view route visible + GS-->>B: resident_view transport session + else not visible yet + GS-->>B: non-resident / retryable miss + end + end + + B->>GS: CompleteReplicaTransport(cancelled,\n"replaced_by_resident_view_transport") + B->>A: BeginReplicaFetch(transport_id,\nartifact_id, view_id, device) + A-->>B: managed=true + Note over A: refresh TTL\nactive_fetches++ + B-)A: ingest dense view payload + B->>A: EndReplicaFetch(transport_id, reason) + A-->>B: managed=true + B->>GS: CompleteReplicaTransport(success) + A->>A: entry returns to idle TTL cache + B-->>Req: local replica ready + end +``` + +#### TTL Expiry / Pressure Eviction + +```mermaid +sequenceDiagram + participant A as daemon A + participant GS as Global Store + + alt TTL expires on idle entry + A->>A: mark entry draining + A->>GS: withdraw resident view route + A->>A: wait for active_fetches == 0 + A->>A: unregister export keys + A->>A: release local replica memory + A->>A: delete cache entry + else new derived view needs space + A->>A: choose oldest eligible idle derived view + A->>GS: withdraw resident view route + A->>A: wait for active_fetches == 0 + A->>A: unregister export keys + A->>A: release memory + A->>A: admit new derived view + end +``` + +#### Canonical Fallback Boundary + +```mermaid +sequenceDiagram + participant B as daemon B + participant GS as Global Store + participant A as daemon A + + B->>GS: RequestReplicaTransport(view) + GS-->>B: no resident view route in probe window + B->>GS: RequestReplicaTransport(canonical) + GS-->>B: canonical route to daemon A + B->>A: try source-side MaterializeReplica + WaitReplicaStatus + + alt source-side upgrade succeeds and resident route appears + A->>GS: resident view registered + B->>GS: RequestReplicaTransport(view) + GS-->>B: resident_view + Note over B: no canonical fallback on winning path + else source-side upgrade cannot complete correctly + Note
over B: canonical fallback remains valid + end +``` + +### Actors + +- daemon B: the destination daemon serving the local materialize request +- daemon A: the remote source daemon that already has the canonical replica +- Global Store: control plane for route selection and transport session assignment + +### Steady-State Resident-View Fast Path + +When the requested `view_id` is already resident and routable on daemon A: + +1. daemon B calls `RequestReplicaTransport(view)` through `GlobalStoreClient::request_view_transport(...)`. +2. The Global Store returns a transport session with route kind `resident_view`. +3. Immediately before the relay read starts, daemon B calls daemon A `BeginReplicaFetch(...)`. +4. daemon A refreshes TTL, increments `active_fetches`, and binds the `transport_id` idempotently. +5. daemon B ingests the transport payload as view byte-space directly. +6. After ingest finishes, daemon B calls daemon A `EndReplicaFetch(...)`. +7. daemon A decrements `active_fetches`. +8. daemon B calls `CompleteReplicaTransport(success)`. +9. daemon B registers its local replica as usual. + +In this path, daemon B never touches canonical byte-space and never performs local `canonical -> view` reconstruction. + +### Current Source-Side Upgrade Path + +The more interesting path is when daemon B wants `artifact_id + view_id`, but the Global Store cannot immediately route a resident view. + +1. daemon B starts with a short `RequestReplicaTransport(view)` probe. + - This is intentionally short-lived. + - It is only trying to discover whether a resident view route already exists. +2. If the probe does not return a resident view route quickly enough, daemon B asks the Global Store for canonical transport by calling `RequestReplicaTransport(canonical)`. +3. The Global Store selects daemon A as the canonical source and returns a remote transport session. +4. 
If the returned route is eligible for source-side upgrade, daemon B directly calls daemon A over daemon RPC: + - `MaterializeReplica(...)` + - selection includes `artifact_id`, `view_id`, and `view_spec` + - `wait_for_completion=false` + - `lease_mode=NO_LEASE` + - `export_policy=FORCE` + - `need_view_data_hash` follows the original materialize request +5. daemon A prepares an ephemeral derived-view export locally from its canonical replica. +6. daemon B calls daemon A `WaitReplicaStatus(...)` until that export is ready, or until the retry / deadline budget is exhausted. +7. Once daemon A has published the derived view, daemon A registers that resident view route back to the Global Store. +8. daemon B then polls the Global Store again with `RequestReplicaTransport(view)` until the Global Store returns route kind `resident_view`. +9. daemon B cancels the superseded canonical transport session with `CompleteReplicaTransport(cancelled, "replaced_by_resident_view_transport")`. +10. Immediately before the winning relay read starts, daemon B calls daemon A `BeginReplicaFetch(...)`. +11. daemon A refreshes TTL, increments `active_fetches`, and records the `transport_id` idempotently. +12. daemon B ingests the resident-view transport payload from daemon A. +13. After ingest finishes, daemon B calls daemon A `EndReplicaFetch(...)`. +14. daemon A decrements `active_fetches`. +15. daemon B calls `CompleteReplicaTransport(success)` for the winning view transport session. +16. If no fetch is active, daemon A keeps the derived view as an idle TTL-scoped cache entry. +17. A background TTL / eviction path later retires the entry by withdrawing the resident-view route first, then releasing local export state and memory. 
+ +This is the key behavior that avoids destination-side canonical reconstruction: daemon B uses the canonical route only as a bootstrap to identify a suitable source daemon, then upgrades the transfer into a true resident-view transport before bytes move into daemon B. + +### Why the Global Store Is Still in the Loop + +Even during source-side upgrade, the Global Store remains the authority for route selection and route visibility: + +- daemon B does not pick arbitrary source daemons on its own; it starts from a Global Store-selected canonical source; +- daemon A does not send bytes directly as an out-of-band blind push; it publishes a resident view route that the Global Store can hand back to daemon B; +- daemon B waits for the Global Store to expose that resident view route before switching transports. + +This keeps the feature aligned with TensorCast semantics: + +- view identity still lives in the control plane as `artifact_id + view_id`; +- the eventual transport session is explicitly a view transport; +- transport completion, cancellation, and observability still flow through the existing transport session lifecycle. + +## Current RPC / Control-Plane Breakdown + +The following calls are involved in the current source-side remote view transport path. 
+ +### daemon B -> Global Store + +- `RequestReplicaTransport(view)` + - probe for an already-routable resident view +- `RequestReplicaTransport(canonical)` + - bootstrap route selection when the resident-view probe misses +- `RequestReplicaTransport(view)` again + - poll for the resident-view route after daemon A has prepared the view +- `CompleteReplicaTransport(...)` + - cancel superseded canonical routes + - mark the winning transport as success or failure + +### daemon B -> daemon A + +- `MaterializeReplica(...)` + - ask daemon A to prepare an ephemeral derived view export +- `WaitReplicaStatus(...)` + - wait until daemon A reports the export ready +- `BeginReplicaFetch(...)` + - relay hot-path lifecycle hook for source-side data-plane use + - refreshes TTL on real use and increments `active_fetches` +- `EndReplicaFetch(...)` + - relay completion hook + - decrements `active_fetches` on success, failure, or abort +- `ReleaseReplica(...)` + - abort / failure cleanup before the prepared export becomes a reusable resident-view cache entry + - not the normal steady-state cleanup mechanism after a successful transfer + +### daemon A -> Global Store + +- register resident view metadata once the derived export becomes visible + - this is what allows later `RequestReplicaTransport(view)` calls from daemon B to return route kind `resident_view` +- withdraw resident view metadata before local retirement + - this is required before daemon A unregisters export keys or drops local memory + +### Byte Movement + +In the upgraded path, the wire payload that daemon B finally ingests is the dense view byte-space prepared by daemon A. 
+ +That means: + +- daemon A pays the local `canonical -> view` derivation cost once; +- daemon B receives final view bytes instead of amplified canonical byte ranges; +- daemon B does not run destination-side canonical remap / strided pack for the winning route; +- later consumers can reuse daemon A's already-prepared dense view within the TTL window without reconstructing it again. + +## When Canonical Fallback Is Still Reasonable + +Canonical fallback is still a valid compatibility path. It should remain available, but it should only be hit for real capability or availability reasons. + +Reasonable fallback cases include: + +- no `view_id` was requested, so canonical transport is the intended behavior; +- the initial `RequestReplicaTransport(view)` probe does not find a resident view quickly enough; +- the Global Store is older or view-unaware and cannot route resident views yet; +- the selected source route is not upgradeable: + - source daemon has no routable daemon gRPC address or port; + - the selected source is actually local / stale and must be reselected; + - source-side `MaterializeReplica` fails; + - source-side `WaitReplicaStatus` fails or does not converge within the request budget; + - daemon A prepares the view, but the resident-view route does not appear in the Global Store before the settle timeout; + - daemon A cannot admit a new derived view even after evicting eligible idle derived views; + - verification or policy requirements for the requested view cannot be satisfied; +- mixed-version rollout requires falling back to canonical transport for correctness. + +In short: canonical fallback is correct as a compatibility and recovery path, but it is not the desired steady-state path for TP-sliced remote loads. + +If relay benchmarks repeatedly hit canonical fallback while all daemons and the Global Store are new enough, routable, and healthy, that should be treated as a bug or timeout-tuning problem rather than normal behavior. 
+ +# Why This Improves Performance + +The current canonical fallback path pays for two avoidable costs on daemon B: + +- read amplification: more bytes are fetched than the final view output size; +- local repack: strided blocks are gathered and packed on the destination daemon. + +With source-side view transport: + +- network payload approaches `view_size_bytes` instead of amplified canonical reads; +- local repack on daemon B disappears for the transport path; +- the source daemon can use its local-memory fast paths when deriving the view from a canonical replica already resident in local DRAM/VRAM. + +For TP-sliced model loading this is the intended shape: prepare the TP slice where the canonical bytes already live, then ship the slice. + +# Interaction With Direct-Write and Local Fast Paths + +This proposal is intentionally compatible with existing local fast paths. + +- Local replica export already benefits from source-local view derivation because the source daemon has direct access to canonical memory. +- Remote relay should converge to the same shape by making daemon A, not daemon B, own the `canonical -> view` transformation. +- This is particularly important for byte-mapped strided views, where the destination daemon currently pays the amplification and pack overhead that the source daemon could avoid or reduce with local-memory access. + +# Verification and Integrity + +Verification must remain byte-space correct. + +- If view verification metadata is available, the transport should carry view-scoped verification state such as `view_id`, `view_size_bytes`, `view_data_hash`, and any future view verification payloads. +- Canonical verification metadata must not be applied to a view payload. +- If `need_view_data_hash=false`, the system may skip view-hash verification, but it must still preserve view byte-space semantics and must not pretend that canonical proof validates the view payload. 
+ +This keeps the transport aligned with the existing rule that view byte-space is distinct from canonical byte-space. + +# Lifecycle and Caching + +The recommended default is ephemeral derived-view export. + +- The source daemon creates a transport-scoped derived view source bound to a transport lease or TTL. +- The destination daemon may materialize and optionally register its own local view replica after ingestion. +- Promotion of source-side derived views to long-lived residency should remain policy-driven, not automatic. + +This avoids polluting global metadata with one-off TP slice transports while still allowing explicit caching or prewarm workflows to publish durable view replicas. + +# Compatibility and Rollout + +The rollout should preserve current mixed-version behavior. + +## Phase 1 Rollout + +- Implement and enable view residency publication. +- Allow `request_view_transport(view)` to route already-resident views. +- Keep canonical fallback unchanged. + +## Phase 2 Rollout + +- Extend `request_view_transport(view)` to support derivation-capable canonical sources. +- Add the source-daemon path that exports an ephemeral derived view source. +- Keep canonical fallback as the final compatibility path when either: + - the Global Store is view-unaware; + - the selected source daemon does not support derived-view export; + - verification or capability requirements for the requested view cannot be met. + +This preserves correctness during partial rollout while moving the performance-critical path toward source-side view transport. + +# Suggested Response Model Changes + +`RequestReplicaTransport(view)` can remain the external entrypoint, but the response should distinguish route kind explicitly. For example, the response should identify whether the selected source is: + +- `resident_view` +- `derived_view_from_canonical` +- `canonical_fallback` + +This route kind should be observable and should feed both metrics and debugging logs. 
The destination daemon should not need to infer route semantics indirectly from missing metadata. + +# Observability Additions + +Add route-aware observability so performance debugging becomes straightforward: + +- transport route kind: `resident_view`, `derived_view_from_canonical`, `canonical_fallback` +- source byte-space kind and identifier +- source-side derivation duration +- destination-side reconstruction duration, expected to be zero for routed view transport +- bytes sent on the wire versus final view bytes +- fallback reason buckets for view transport misses + +The steady-state success signal for this design is: + +- `request_view_transport(view)` succeeds without canonical fallback; +- daemon B no longer reports canonical read amplification for TP-sliced remote loads; +- the dominant time moves from destination-side reconstruction to either source-side derivation or pure transport. From 4a8d5013556bb06a20cde0067e2ccf81f34d9ba9 Mon Sep 17 00:00:00 2001 From: zhouyuhan Date: Thu, 19 Mar 2026 14:03:32 +0800 Subject: [PATCH 08/10] feat(daemon): implement and test source-side remote view export and lifecycle management --- core/common/memory/pinned_memory_authority.cc | 7 +- core/common/memory/pinned_memory_authority.h | 2 + core/store/README.md | 2 +- core/store/communication_types.h | 1 + core/store/components/global_store_client.cc | 96 +- core/store/components/global_store_client.h | 9 + core/store/materialization/control/BUILD | 2 + .../control/materialize_orchestrator.cc | 544 +++++++++- .../materialization/dataplane/runtime/pump.cc | 38 +- .../sources/byte_range_mapped_source.cc | 41 +- .../runtime/pipeline/allocation_stage.cc | 1 + .../runtime/pipeline/verification_stage.cc | 19 +- core/store/replica/replica.cc | 13 +- core/store/replica/replica.h | 2 + core/store/replica/replica_config.h | 1 + .../ingestion/materialization_facade.cc | 131 ++- .../testing/recording_global_store_client.h | 9 + daemon/BUILD | 52 + daemon/app/daemon_app.cc | 6 +- 
daemon/app/server_main.cc | 21 +- daemon/ha/worker_lifecycle_manager.cc | 29 +- .../controllers/materialization_controller.cc | 2 + .../controllers/materialization_controller.h | 2 + .../materialization_request_common_utils.cc | 3 + .../replica_materialization_service.cc | 341 ++++++- .../replica_materialization_service.h | 4 + .../controllers/transport_controller.cc | 84 +- .../controllers/transport_controller.h | 12 + daemon/service/grpc_service_impl.h | 10 + .../grpc_service_impl_rpc_delegates.cc | 16 + daemon/state/daemon_kernel.cc | 10 +- daemon/state/daemon_kernel.h | 6 + daemon/state/daemon_options.h | 7 + daemon/state/derived_view_export_manager.cc | 952 ++++++++++++++++++ daemon/state/derived_view_export_manager.h | 166 +++ ...rived_view_export_manager_eviction_test.cc | 213 ++++ .../state/derived_view_export_manager_test.cc | 232 +++++ daemon/state/sweep_tasks.h | 8 +- daemon/state/transport_lock_manager.h | 11 + daemon/testing/daemon_service_harness.cc | 8 +- .../0086-source-side-remote-view-transport.md | 192 ++++ .../tensorcast/config/v1/daemon_config.proto | 17 + proto/tensorcast/daemon/v2/store_daemon.proto | 23 + .../global_store/v1/global_store.proto | 16 + tensorcast/global_store/grpc_service.py | 2 + .../repositories/view_repository.py | 13 +- .../repositories/worker_repository.py | 19 + .../global_store/rpc/transport_rpc_handler.py | 89 +- .../rpc/view_proof_rpc_handler.py | 2 - .../services/transport_service.py | 51 +- .../python/global_store/test_grpc_service.py | 166 +++ 51 files changed, 3565 insertions(+), 138 deletions(-) create mode 100644 daemon/state/derived_view_export_manager.cc create mode 100644 daemon/state/derived_view_export_manager.h create mode 100644 daemon/state/derived_view_export_manager_eviction_test.cc create mode 100644 daemon/state/derived_view_export_manager_test.cc create mode 100644 docs/plans/0086-source-side-remote-view-transport.md diff --git a/core/common/memory/pinned_memory_authority.cc 
b/core/common/memory/pinned_memory_authority.cc index 31fc348ed..4ec9988bc 100644 --- a/core/common/memory/pinned_memory_authority.cc +++ b/core/common/memory/pinned_memory_authority.cc @@ -87,9 +87,12 @@ absl::Status PinnedMemoryAuthority::validate_and_build_pools() { cfg_.total_bytes = sum_pool_bytes; for (const auto& cls : classes_) { + PinnedBufferPool::Options pool_opts; + pool_opts.name = cls.name; + pool_opts.numa_node = cls.numa_node; + pool_opts.prefault = cls.numa_prefault; pools_.push_back( - std::make_shared( - static_cast(cls.pool_bytes), cls.slice_bytes, /*name=*/std::string(cls.name))); + std::make_shared(static_cast(cls.pool_bytes), cls.slice_bytes, std::move(pool_opts))); } return absl::OkStatus(); diff --git a/core/common/memory/pinned_memory_authority.h b/core/common/memory/pinned_memory_authority.h index 5f9f89d3d..c2623435a 100644 --- a/core/common/memory/pinned_memory_authority.h +++ b/core/common/memory/pinned_memory_authority.h @@ -25,6 +25,8 @@ class PinnedMemoryAuthority { // Phase 1 fixed-allocation: fully preallocated capacity of this class pool. uint64_t pool_bytes = 0; bool rdma_preregister = false; + int numa_node = -1; + bool numa_prefault = false; }; struct Config { diff --git a/core/store/README.md b/core/store/README.md index 7f65c94b4..3e6004906 100644 --- a/core/store/README.md +++ b/core/store/README.md @@ -464,7 +464,7 @@ Implementation: `try_evict_gpu_memory_impl()` in store_engine.cc. CPU VS memory - Content-addressed `mi2:...` artifact IDs require an authoritative source binding (Global Store routing, managed shared-disk location, or daemon-local disk import). Core no longer accepts string path hints. - Per-GPU transfer concurrency is limited to 1 active session by design to reduce VRAM fragmentation and pressure. - Canonical tensor indices are expected in schema version `"v3"`; the engine emits v3 descriptors and rejects older schemas on write paths. 
-- View replica registration still publishes canonical residency to Global Store. The engine issues a best-effort `record_view_residency` call so the daemon can start wiring the future RPC; until Global Store implements it the call returns `UNIMPLEMENTED` and is logged at `VLOG(1)`. +- View replica registration publishes minimal view residency to Global Store via `record_view_residency`. This residency record is queryable even when only `view_id`, `view_size`, and optional `view_data_hash` are known; explicit view registration paths still carry the full `view_spec_json` and coverage metadata. For broader architectural context, see docs/architecture.md and docs/state-management.md. - Verification metadata: canonical replicas reuse `verification.json`. Views persist per-view metadata under `verification.view_.json`; each record carries the `byte_space_id` so canonical metadata is never reused for a view. Every persisted payload now embeds a `metadata_signature` (canonical SHA-256 of the serialized payload). The loader re-reads the on-disk JSON on every materialization, validates the signature, and compares the payload fingerprint against any cached entry before reuse. Tampered or truncated files trigger `DataLoss` (cache is invalidated) and force regeneration, while cached entries are only reused when the file is absent and a fresh persistence will rewrite it. P2P senders may still provide inline `verification_json`; the backend `ingest_from_p2p()` path now flows through the runtime pipeline, which performs fast KEY_POINTS verification of the loaded replica (CPU/GPU). Verification failure returns a `DataLoss` error and aborts materialization. All metadata reads/writes are serialized through `VerificationMetadataGuard`, persisted via an atomic write helper (`open` → `write` → `fsync` → `rename` + directory `fsync`), and accompanied by structured `verification_metadata_write_{succeeded,failed}` logs that surface artifact, byte-space, guard wait, and write durations. 
diff --git a/core/store/communication_types.h b/core/store/communication_types.h index a3167a737..4c9cfa5fb 100644 --- a/core/store/communication_types.h +++ b/core/store/communication_types.h @@ -35,6 +35,7 @@ struct P2PSource { std::vector memory_keys; std::vector buf_sizes; bool enable_checksum = true; + bool source_is_view = false; Location location; // Optional verification metadata (JSON) passed from the sender side, e.g., diff --git a/core/store/components/global_store_client.cc b/core/store/components/global_store_client.cc index cc1d03367..3f84c82c5 100644 --- a/core/store/components/global_store_client.cc +++ b/core/store/components/global_store_client.cc @@ -99,6 +99,57 @@ const char* status_to_cstr(global_store::Status s) { } } +std::optional convert_view_transport_metadata( + const global_store::RequestReplicaTransportResponse& response) { + if (!response.has_view_transport_metadata()) { + return std::nullopt; + } + const auto& proto = response.view_transport_metadata(); + if (proto.view_id().empty()) { + return std::nullopt; + } + ViewTransportMetadata metadata; + metadata.view_id = proto.view_id(); + metadata.view_size_bytes = proto.view_size_bytes(); + if (proto.has_view_data_hash() && !proto.view_data_hash().empty()) { + metadata.view_data_hash = proto.view_data_hash(); + } + return metadata; +} + +global_store::TransportRouteKind normalize_transport_route_kind( + global_store::TransportRouteKind route_kind, + const RemoteReplicaInfo& remote_replica, + std::optional requested_view_id) { + if (route_kind != global_store::TRANSPORT_ROUTE_KIND_UNSPECIFIED) { + return route_kind; + } + if (requested_view_id.has_value() && remote_replica.view_id.has_value() && + *remote_replica.view_id == std::string(*requested_view_id)) { + return global_store::TRANSPORT_ROUTE_KIND_RESIDENT_VIEW; + } + return global_store::TRANSPORT_ROUTE_KIND_CANONICAL; +} + +std::optional normalize_view_transport_metadata( + const global_store::RequestReplicaTransportResponse& 
response, + const RemoteReplicaInfo& remote_replica, + std::optional requested_view_id) { + auto metadata = convert_view_transport_metadata(response); + if (metadata.has_value()) { + return metadata; + } + if (requested_view_id.has_value() && remote_replica.view_id.has_value() && + *remote_replica.view_id == std::string(*requested_view_id)) { + return ViewTransportMetadata{ + .view_id = *remote_replica.view_id, + .view_size_bytes = remote_replica.memory_size, + .view_data_hash = std::nullopt, + }; + } + return std::nullopt; +} + global_store::TransportCompletionOutcome to_proto_transport_completion_outcome(TransportCompletionOutcome outcome) { switch (outcome) { case TransportCompletionOutcome::kSuccess: @@ -865,14 +916,27 @@ absl::Status GlobalStoreClient::record_view_residency( std::string_view view_id, uint64_t view_size_bytes, std::optional view_data_hash) { - (void)canonical_artifact_id; - (void)view_id; - (void)view_size_bytes; - (void)view_data_hash; - // A dedicated RPC for view metadata will be introduced for view-residency signals. - // Until that lands, treat this as a best-effort noop so core plumbing can wire - // the call sites without coupling to server availability. 
- return absl::UnimplementedError("Global Store view residency RPC not yet implemented"); + if (!is_connected()) { + return absl::FailedPreconditionError("GlobalStoreClient not connected"); + } + if (canonical_artifact_id.empty()) { + return absl::InvalidArgumentError("record_view_residency requires canonical_artifact_id"); + } + if (view_id.empty()) { + return absl::InvalidArgumentError("record_view_residency requires view_id"); + } + if (view_size_bytes == 0) { + return absl::InvalidArgumentError("record_view_residency requires view_size_bytes > 0"); + } + + ViewStateUpdate update; + update.artifact_id = std::string(canonical_artifact_id); + update.view_id = std::string(view_id); + update.view_size_bytes = view_size_bytes; + if (view_data_hash.has_value() && !view_data_hash->empty()) { + update.view_data_hash = std::string(*view_data_hash); + } + return update_artifact_view_state(update); } absl::Status GlobalStoreClient::update_artifact_view_state(const ViewStateUpdate& update) { @@ -1643,6 +1707,10 @@ absl::Status GlobalStoreClient::unregister_replica(std::string_view artifact_id, } if (response.status() != global_store::STATUS_OK) { + if (response.status() == global_store::STATUS_NOT_FOUND) { + return absl::NotFoundError( + absl::StrFormat("UnregisterReplica target not found: artifact_id=%s replica_id=%s", artifact_id, replica_id)); + } return absl::InternalError(absl::StrFormat("UnregisterReplica failed with status: %d", response.status())); } @@ -1861,7 +1929,10 @@ absl::StatusOr GlobalStoreClient::request_replica_transport( TransportSession session; session.transport_id = response.transport_id(); session.remote_replica = convert_from_proto_memory_info(response.remote_memory_info()); + session.remote_replica.grpc_port = response.source_grpc_port(); session.start_time = absl::Now(); + session.route_kind = normalize_transport_route_kind(response.route_kind(), session.remote_replica, std::nullopt); + session.view_transport_metadata = 
normalize_view_transport_metadata(response, session.remote_replica, std::nullopt); LOG(INFO) << "Started P2P transport " << session.transport_id << " from " << session.remote_replica.node_id; @@ -1953,7 +2024,10 @@ absl::StatusOr GlobalStoreClient::request_view_transport( TransportSession session; session.transport_id = response.transport_id(); session.remote_replica = convert_from_proto_memory_info(response.remote_memory_info()); + session.remote_replica.grpc_port = response.source_grpc_port(); session.start_time = absl::Now(); + session.route_kind = normalize_transport_route_kind(response.route_kind(), session.remote_replica, view_id); + session.view_transport_metadata = normalize_view_transport_metadata(response, session.remote_replica, view_id); LOG(INFO) << "Started view transport " << session.transport_id << " from " << session.remote_replica.node_id << " view_id=" << view_id; @@ -2534,9 +2608,13 @@ absl::StatusOr GlobalStoreClient::resolve_key_mapping_with_options( if (!status.ok()) { return status; } - if (response.status() != global_store::STATUS_OK) { + if (response.status() == global_store::STATUS_NOT_FOUND) { return absl::NotFoundError("key not found"); } + if (response.status() != global_store::STATUS_OK) { + return absl::InternalError( + absl::StrFormat("ResolveKeyMapping failed with global-store status=%d", static_cast(response.status()))); + } KeyMapping out{ .artifact_id = response.artifact_id(), diff --git a/core/store/components/global_store_client.h b/core/store/components/global_store_client.h index 0452777f3..ea7ea6848 100644 --- a/core/store/components/global_store_client.h +++ b/core/store/components/global_store_client.h @@ -88,6 +88,7 @@ struct RemoteReplicaInfo { std::string node_id; std::string node_address; uint32_t node_port; + uint32_t grpc_port{0}; uint64_t memory_size; common::memory::MemoryLocation memory_type; uint32_t device_id; @@ -97,11 +98,19 @@ struct RemoteReplicaInfo { std::optional view_id; }; +struct ViewTransportMetadata 
{ + std::string view_id; + uint64_t view_size_bytes{0}; + std::optional view_data_hash; +}; + // Transport session for P2P transfers struct TransportSession { std::string transport_id; RemoteReplicaInfo remote_replica; absl::Time start_time; + global_store::TransportRouteKind route_kind{global_store::TRANSPORT_ROUTE_KIND_UNSPECIFIED}; + std::optional view_transport_metadata; }; struct TransportSchedulingGroupHint { diff --git a/core/store/materialization/control/BUILD b/core/store/materialization/control/BUILD index 47847acd2..3841be796 100644 --- a/core/store/materialization/control/BUILD +++ b/core/store/materialization/control/BUILD @@ -40,6 +40,8 @@ sc_cc_library( "//core/store:global_store_client", "//core/store:worker_identity", "//core/store/materialization/contracts:loading_spec", + "//proto/tensorcast/daemon/v2:daemon_grpc_cc", + "@abseil-cpp//absl/cleanup", "@abseil-cpp//absl/log:absl_log", "@abseil-cpp//absl/strings", "@gsl", diff --git a/core/store/materialization/control/materialize_orchestrator.cc b/core/store/materialization/control/materialize_orchestrator.cc index d925a37b9..39aa3e0bf 100644 --- a/core/store/materialization/control/materialize_orchestrator.cc +++ b/core/store/materialization/control/materialize_orchestrator.cc @@ -9,17 +9,23 @@ #include #include #include +#include #include + +#include "absl/cleanup/cleanup.h" #include "absl/log/log.h" #include "absl/status/status.h" +#include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "core/store/components/global_store_client.h" #include "core/store/materialization/contracts/loading_spec.h" +#include "grpcpp/grpcpp.h" #include "opentelemetry/common/attribute_value.h" #include "opentelemetry/common/key_value_iterable_view.h" #include "opentelemetry/context/context.h" #include "opentelemetry/metrics/meter.h" #include "opentelemetry/metrics/provider.h" +#include "tensorcast/daemon/v2/store_daemon.grpc.pb.h" namespace tensorcast::store::materialization::control { @@ -35,6 +41,8 
@@ namespace { constexpr uint32_t kDefaultTransportWaitTimeoutMs = 30000; constexpr uint32_t kViewTransportProbeTimeoutMs = 1000; +constexpr std::chrono::milliseconds kDerivedViewRouteSettleTimeout{10000}; +constexpr std::chrono::milliseconds kDerivedViewRouteSettleAfterRetryableWaitError{30000}; constexpr int kMaxReselectionAttemptsWithoutBudget = 64; constexpr std::chrono::milliseconds kMinReselectionBudget{1}; constexpr std::chrono::milliseconds kMinReselectionBackoff{50}; @@ -45,6 +53,8 @@ struct SourceReselectionMetrics { opentelemetry::nostd::shared_ptr> reselection_attempt_total; opentelemetry::nostd::shared_ptr> reselection_success_total; opentelemetry::nostd::shared_ptr> reselection_exhausted_total; + opentelemetry::nostd::shared_ptr> route_selected_total; + opentelemetry::nostd::shared_ptr> fallback_total; }; SourceReselectionMetrics& source_reselection_metrics() { @@ -55,6 +65,57 @@ SourceReselectionMetrics& source_reselection_metrics() { return metrics; } +std::string route_kind_to_string(tensorcast::global_store::v1::TransportRouteKind route_kind) { + switch (route_kind) { + case tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_CANONICAL: + return "canonical"; + case tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_RESIDENT_VIEW: + return "resident_view"; + case tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_DERIVED_VIEW_FROM_CANONICAL: + return "derived_view_from_canonical"; + case tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_UNSPECIFIED: + return "unspecified"; + default: + break; + } + return absl::StrCat("unknown(", static_cast(route_kind), ")"); +} + +void record_route_selected( + tensorcast::global_store::v1::TransportRouteKind route_kind, + bool view_scoped, + bool canonical_fallback) { + try { + auto& metrics = source_reselection_metrics(); + if (!metrics.route_selected_total) { + metrics.route_selected_total = metrics.meter->CreateDoubleCounter("tc_view_transport_route_selected_total"); + } + std::map attrs; + 
attrs.emplace("route_kind", route_kind_to_string(route_kind)); + attrs.emplace("scope", std::string(view_scoped ? "view" : "canonical")); + attrs.emplace("canonical_fallback", canonical_fallback ? "true" : "false"); + metrics.route_selected_total->Add( + 1.0, opentelemetry::common::KeyValueIterableView(attrs), opentelemetry::context::Context{}); + } catch (...) { + } +} + +void record_view_transport_fallback(std::string_view reason, std::string_view stage, bool view_scoped) { + try { + auto& metrics = source_reselection_metrics(); + if (!metrics.fallback_total) { + metrics.fallback_total = metrics.meter->CreateDoubleCounter("tc_view_transport_fallback_total"); + } + std::map attrs; + attrs.emplace("reason", std::string(reason)); + attrs.emplace("stage", std::string(stage)); + attrs.emplace("scope", std::string(view_scoped ? "view" : "canonical")); + metrics.fallback_total->Add( + 1.0, opentelemetry::common::KeyValueIterableView(attrs), opentelemetry::context::Context{}); + } catch (...) { + } +} + void add_source_reselection_counter( const opentelemetry::nostd::shared_ptr>& counter, std::string_view reason, @@ -155,6 +216,11 @@ bool should_retry_source_selection(const absl::Status& status) { absl::IsDeadlineExceeded(status); } +bool is_retryable_grpc_status(const grpc::Status& status) { + return status.error_code() == grpc::StatusCode::UNAVAILABLE || + status.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED; +} + std::optional resolve_request_deadline(const MaterializeHints& hints) { if (hints.request_budget.count() <= 0) { return std::nullopt; @@ -201,6 +267,21 @@ uint32_t effective_transport_wait_timeout_ms( return std::max(1, clamp_timeout_to_u32_ms(timeout)); } +std::string budget_ms_label(std::chrono::milliseconds timeout) { + if (timeout == std::chrono::milliseconds::max()) { + return "unbounded"; + } + return std::to_string(timeout.count()); +} + +bool is_source_side_upgrade_terminal_timeout(const absl::Status& status) { + if 
(absl::IsDeadlineExceeded(status)) { + return true; + } + return (absl::IsFailedPrecondition(status) || absl::IsUnavailable(status)) && + absl::StrContains(status.message(), "budget exhausted"); +} + uint32_t view_transport_probe_timeout_ms(uint32_t transport_wait_timeout_ms) { if (transport_wait_timeout_ms == 0) { return 0; @@ -281,6 +362,110 @@ absl::Status source_reselection_exhausted_status( "source reselection budget exhausted for artifact_id=", artifact_id, " attempts=", reselection_attempt)); } +struct PreparedRemoteReplicaCleanup { + std::string daemon_address; + std::string replica_uuid; +}; + +struct RemoteReplicaFetchCleanup { + std::string daemon_address; + std::string transport_id; +}; + +absl::Status release_prepared_remote_replica(const PreparedRemoteReplicaCleanup& cleanup) { + auto channel = grpc::CreateChannel(cleanup.daemon_address, grpc::InsecureChannelCredentials()); + auto stub = tensorcast::daemon::v2::StoreDaemonService::NewStub(channel); + grpc::ClientContext release_ctx; + release_ctx.set_deadline(std::chrono::system_clock::now() + std::chrono::seconds(5)); + tensorcast::daemon::v2::ReleaseReplicaRequest release_req; + release_req.mutable_ticket()->set_replica_uuid(cleanup.replica_uuid); + tensorcast::daemon::v2::ReleaseReplicaResponse release_resp; + const grpc::Status release_status = stub->ReleaseReplica(&release_ctx, release_req, &release_resp); + if (!release_status.ok()) { + return absl::UnavailableError( + absl::StrCat( + "source daemon ReleaseReplica failed for derived view export: address=", + cleanup.daemon_address, + " replica_uuid=", + cleanup.replica_uuid, + " status=", + release_status.error_message())); + } + return absl::OkStatus(); +} + +tensorcast::daemon::v2::DeviceType to_daemon_device_type(common::memory::MemoryLocation location) { + switch (location) { + case common::memory::MemoryLocation::CPU: + return tensorcast::daemon::v2::DEVICE_TYPE_CPU; + case common::memory::MemoryLocation::GPU: + return 
tensorcast::daemon::v2::DEVICE_TYPE_GPU; + default: + return tensorcast::daemon::v2::DEVICE_TYPE_UNSPECIFIED; + } +} + +absl::StatusOr begin_remote_replica_fetch( + const components::TransportSession& session, + std::string_view artifact_id, + std::string_view view_id) { + if (session.remote_replica.grpc_port == 0 || session.remote_replica.node_address.empty()) { + return absl::UnimplementedError("source daemon fetch lifecycle requires routable daemon gRPC address"); + } + auto channel = grpc::CreateChannel( + absl::StrCat(session.remote_replica.node_address, ":", session.remote_replica.grpc_port), + grpc::InsecureChannelCredentials()); + auto stub = tensorcast::daemon::v2::StoreDaemonService::NewStub(channel); + grpc::ClientContext ctx; + ctx.set_deadline(std::chrono::system_clock::now() + std::chrono::seconds(5)); + tensorcast::daemon::v2::BeginReplicaFetchRequest req; + req.set_transport_id(session.transport_id); + req.set_artifact_id(std::string(artifact_id)); + req.set_view_id(std::string(view_id)); + req.set_device_type(to_daemon_device_type(session.remote_replica.memory_type)); + if (session.remote_replica.memory_type == common::memory::MemoryLocation::GPU) { + req.set_device_id(session.remote_replica.device_id); + } + tensorcast::daemon::v2::BeginReplicaFetchResponse resp; + const grpc::Status status = stub->BeginReplicaFetch(&ctx, req, &resp); + if (status.error_code() == grpc::StatusCode::UNIMPLEMENTED) { + return false; + } + if (!status.ok()) { + return absl::UnavailableError( + absl::StrCat( + "source daemon BeginReplicaFetch failed: transport_id=", + session.transport_id, + " status=", + status.error_message())); + } + return resp.managed(); +} + +absl::Status end_remote_replica_fetch(const RemoteReplicaFetchCleanup& cleanup, std::string_view reason) { + auto channel = grpc::CreateChannel(cleanup.daemon_address, grpc::InsecureChannelCredentials()); + auto stub = tensorcast::daemon::v2::StoreDaemonService::NewStub(channel); + grpc::ClientContext ctx; + 
ctx.set_deadline(std::chrono::system_clock::now() + std::chrono::seconds(5)); + tensorcast::daemon::v2::EndReplicaFetchRequest req; + req.set_transport_id(cleanup.transport_id); + req.set_reason(std::string(reason)); + tensorcast::daemon::v2::EndReplicaFetchResponse resp; + const grpc::Status status = stub->EndReplicaFetch(&ctx, req, &resp); + if (status.error_code() == grpc::StatusCode::UNIMPLEMENTED) { + return absl::OkStatus(); + } + if (!status.ok()) { + return absl::UnavailableError( + absl::StrCat( + "source daemon EndReplicaFetch failed: transport_id=", + cleanup.transport_id, + " status=", + status.error_message())); + } + return absl::OkStatus(); +} + } // namespace MaterializeOrchestrator::MaterializeOrchestrator( @@ -397,9 +582,12 @@ absl::StatusOr MaterializeOrchestrator::run( (absl::IsNotFound(view_transport_or.status()) || absl::IsUnimplemented(view_transport_or.status()) || absl::IsDeadlineExceeded(view_transport_or.status()))) { view_transport_status = view_transport_or.status(); - LOG(INFO) << "request_view_transport unavailable for artifact_id=" << artifact_id << " view_id=" << *view_id - << " within probe_timeout_ms=" << view_probe_timeout_ms - << "; retrying canonical transport route with wait_timeout_ms=" << wait_timeout_ms; + record_view_transport_fallback("view_transport_probe_unavailable", "request_transport", /*view_scoped=*/true); + LOG(INFO) << "route_kind=canonical_fallback" + << " fallback_reason=view_transport_probe_unavailable" + << " artifact_id=" << artifact_id << " view_id=" << *view_id + << " probe_status=" << view_transport_or.status() << " probe_timeout_ms=" << view_probe_timeout_ms + << " canonical_wait_timeout_ms=" << wait_timeout_ms; auto canonical_transport_or = gs_client_->request_replica_transport( artifact_id, local_identity_.node_id, @@ -427,6 +615,267 @@ absl::StatusOr MaterializeOrchestrator::run( transport_request_id); }; + struct UpgradedDerivedViewTransport { + components::TransportSession session; + 
PreparedRemoteReplicaCleanup cleanup; + }; + + auto try_upgrade_derived_view_transport = [&](const components::TransportSession& source_session) + -> absl::StatusOr> { + if (!view_id.has_value()) { + return std::optional(); + } + if (source_session.route_kind == tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_RESIDENT_VIEW) { + return std::optional(); + } + if (is_local_replica(source_session.remote_replica, local_identity_)) { + return std::optional(); + } + if (source_session.remote_replica.grpc_port == 0 || source_session.remote_replica.node_address.empty()) { + return absl::UnavailableError("source-side derived view transport requires a routable source daemon gRPC port"); + } + + auto fill_proto_view_spec = [&](tensorcast::common::v1::ViewSpec* out) { + out->clear_tensors(); + if (!hints.variant.has_value() || !hints.variant->view_spec.has_value()) { + return; + } + for (const auto& [tensor_name, tensor_ops] : hints.variant->view_spec->tensors) { + auto& proto_tensor_ops = (*out->mutable_tensors())[tensor_name]; + for (const auto& op : tensor_ops.ops) { + auto* proto_op = proto_tensor_ops.add_ops(); + switch (op.kind) { + case store::materialization::view::ViewOp::Kind::kNarrow: { + auto* narrow = proto_op->mutable_narrow(); + narrow->set_dim(op.narrow.dim); + narrow->set_start(op.narrow.start); + narrow->set_length(op.narrow.length); + break; + } + case store::materialization::view::ViewOp::Kind::kTranspose: { + auto* transpose = proto_op->mutable_transpose(); + transpose->set_dim0(op.transpose.dim0); + transpose->set_dim1(op.transpose.dim1); + break; + } + } + } + } + }; + + const uint32_t wait_timeout_ms = effective_transport_wait_timeout_ms(hints, request_deadline); + if (wait_timeout_ms == 0) { + return absl::DeadlineExceededError( + absl::StrCat("transport wait budget exhausted before source-side derive: artifact_id=", artifact_id)); + } + const std::string daemon_address = + absl::StrCat(source_session.remote_replica.node_address, ":", 
source_session.remote_replica.grpc_port); + const std::chrono::milliseconds remaining_before_upgrade = remaining_request_budget(request_deadline); + LOG(INFO) << "event=source_side_upgrade_budget" + << " artifact_id=" << artifact_id << " view_id=" << *view_id << " route_kind=derived_view_from_canonical" + << " request_budget_ms=" << hints.request_budget.count() + << " transport_wait_timeout_ms=" << hints.transport_wait_timeout.count() + << " remaining_request_budget_ms=" << budget_ms_label(remaining_before_upgrade) + << " source_prepare_wait_budget_ms=" << wait_timeout_ms << " source_daemon=" << daemon_address; + const std::string prepare_replica_uuid = absl::StrCat("derived-view-export:", source_session.transport_id); + tensorcast::daemon::v2::MaterializeReplicaRequest prepare_req; + prepare_req.mutable_selection()->set_artifact_id(std::string(artifact_id)); + prepare_req.mutable_selection()->set_view_id(*view_id); + fill_proto_view_spec(prepare_req.mutable_selection()->mutable_view_spec()); + prepare_req.set_replica_uuid(prepare_replica_uuid); + prepare_req.set_target_device_type(tensorcast::daemon::v2::DEVICE_TYPE_CPU); + prepare_req.set_size_bytes( + source_session.view_transport_metadata.has_value() ? 
source_session.view_transport_metadata->view_size_bytes + : source_session.remote_replica.memory_size); + prepare_req.mutable_source_policy()->set_preference(tensorcast::daemon::v2::SOURCE_PREFERENCE_AUTO); + prepare_req.mutable_source_policy()->set_allow_p2p(true); + prepare_req.mutable_source_policy()->set_allow_disk(false); + prepare_req.set_lease_mode(tensorcast::daemon::v2::LEASE_MODE_NO_LEASE); + prepare_req.set_export_policy(tensorcast::daemon::v2::EXPORT_POLICY_FORCE); + prepare_req.set_wait_for_completion(false); + prepare_req.set_need_view_data_hash(hints.need_view_data_hash); + + tensorcast::daemon::v2::MaterializeReplicaResponse prepare_resp; + auto call_prepare_materialize = [&]() -> grpc::Status { + auto channel = grpc::CreateChannel(daemon_address, grpc::InsecureChannelCredentials()); + auto stub = tensorcast::daemon::v2::StoreDaemonService::NewStub(channel); + grpc::ClientContext prepare_ctx; + prepare_ctx.set_deadline(std::chrono::system_clock::now() + std::chrono::milliseconds(wait_timeout_ms)); + prepare_resp.Clear(); + return stub->MaterializeReplica(&prepare_ctx, prepare_req, &prepare_resp); + }; + + grpc::Status prepare_status; + for (int attempt = 1; attempt <= 2; ++attempt) { + prepare_status = call_prepare_materialize(); + if (prepare_status.ok()) { + break; + } + if (attempt == 2 || !is_retryable_grpc_status(prepare_status)) { + return absl::UnavailableError( + absl::StrCat( + "source daemon MaterializeReplica failed for derived view export: address=", + daemon_address, + " status=", + prepare_status.error_message())); + } + LOG(INFO) << "Retrying source daemon MaterializeReplica for derived view export after retryable gRPC error: " + << "address=" << daemon_address << " attempt=" << attempt + << " status=" << prepare_status.error_message(); + } + if (prepare_resp.status() != tensorcast::daemon::v2::MATERIALIZE_REPLICA_STATUS_ALLOCATED) { + return absl::FailedPreconditionError( + absl::StrCat( + "source daemon MaterializeReplica did not 
allocate derived view export: address=", + daemon_address, + " status=", + static_cast(prepare_resp.status()))); + } + if (!prepare_resp.has_ticket() || prepare_resp.ticket().replica_uuid().empty()) { + return absl::FailedPreconditionError("source daemon derived view export did not return a ticket"); + } + PreparedRemoteReplicaCleanup cleanup{ + .daemon_address = daemon_address, + .replica_uuid = prepare_resp.ticket().replica_uuid(), + }; + auto cleanup_guard = absl::MakeCleanup([&]() { + const absl::Status release_status = release_prepared_remote_replica(cleanup); + if (!release_status.ok()) { + LOG(WARNING) << "best-effort release of derived view export failed during upgrade: " << release_status; + } + }); + + tensorcast::daemon::v2::WaitReplicaStatusRequest wait_req; + wait_req.mutable_ticket()->set_replica_uuid(prepare_resp.ticket().replica_uuid()); + wait_req.set_timeout_ms(wait_timeout_ms); + tensorcast::daemon::v2::WaitReplicaStatusResponse wait_resp; + auto call_wait_replica_status = [&]() -> grpc::Status { + auto channel = grpc::CreateChannel(daemon_address, grpc::InsecureChannelCredentials()); + auto stub = tensorcast::daemon::v2::StoreDaemonService::NewStub(channel); + grpc::ClientContext wait_ctx; + wait_ctx.set_deadline(std::chrono::system_clock::now() + std::chrono::milliseconds(wait_timeout_ms)); + wait_resp.Clear(); + return stub->WaitReplicaStatus(&wait_ctx, wait_req, &wait_resp); + }; + + grpc::Status wait_status; + bool wait_status_retryable_unknown = false; + for (int attempt = 1; attempt <= 2; ++attempt) { + wait_status = call_wait_replica_status(); + if (wait_status.ok()) { + break; + } + if (attempt == 2 || !is_retryable_grpc_status(wait_status)) { + if (is_retryable_grpc_status(wait_status)) { + wait_status_retryable_unknown = true; + LOG(INFO) << "source daemon WaitReplicaStatus remained retryable for derived view export; " + << "probing resident view route directly: address=" << daemon_address + << " status=" << 
wait_status.error_message(); + break; + } + return absl::UnavailableError( + absl::StrCat( + "source daemon WaitReplicaStatus failed for derived view export: address=", + daemon_address, + " status=", + wait_status.error_message())); + } + LOG(INFO) << "Retrying source daemon WaitReplicaStatus for derived view export after retryable gRPC error: " + << "address=" << daemon_address << " attempt=" << attempt << " status=" << wait_status.error_message(); + } + if (!wait_status_retryable_unknown && + wait_resp.status().state() != tensorcast::daemon::v2::REPLICA_OPERATION_STATE_SUCCESS) { + if (wait_resp.status().state() == tensorcast::daemon::v2::REPLICA_OPERATION_STATE_DEGRADED && + absl::StrContains(wait_resp.status().message(), "budget exhausted")) { + return absl::DeadlineExceededError( + absl::StrCat( + "source daemon derived view export wait budget exhausted: address=", + daemon_address, + " message=", + wait_resp.status().message())); + } + return absl::FailedPreconditionError( + absl::StrCat( + "source daemon derived view export did not become ready: state=", + static_cast(wait_resp.status().state()), + " message=", + wait_resp.status().message())); + } + + const std::chrono::milliseconds remaining = remaining_request_budget(request_deadline); + std::chrono::milliseconds settle_budget; + if (remaining == std::chrono::milliseconds::max()) { + settle_budget = wait_status_retryable_unknown ? kDerivedViewRouteSettleAfterRetryableWaitError + : kDerivedViewRouteSettleTimeout; + } else { + settle_budget = std::max(kMinReselectionBudget, remaining); + } + LOG(INFO) << "event=source_side_upgrade_route_settle_budget" + << " artifact_id=" << artifact_id << " view_id=" << *view_id << " route_kind=derived_view_from_canonical" + << " remaining_request_budget_ms=" << budget_ms_label(remaining) + << " settle_budget_ms=" << budget_ms_label(settle_budget) + << " wait_status_retryable_unknown=" << (wait_status_retryable_unknown ? 
"true" : "false"); + const auto settle_deadline = std::chrono::steady_clock::now() + settle_budget; + int resident_view_attempt = 0; + int last_route_kind = 0; + while (true) { + const auto now = std::chrono::steady_clock::now(); + if (now >= settle_deadline) { + return absl::DeadlineExceededError( + absl::StrCat( + "source-side derived view export completed but resident view route did not appear within " + "settle_timeout_ms=", + settle_budget.count(), + " last_route_kind=", + last_route_kind)); + } + const auto poll_remaining = std::chrono::duration_cast(settle_deadline - now); + const uint32_t poll_wait_timeout_ms = std::max(1, clamp_timeout_to_u32_ms(poll_remaining)); + auto upgraded_view_transport_or = gs_client_->request_view_transport( + artifact_id, + *view_id, + local_identity_.node_id, + local_identity_.node_address, + local_identity_.p2p_port, + target_device, + poll_wait_timeout_ms, + scheduling_group_hint, + requester_worker_id, + std::string_view()); + if (!upgraded_view_transport_or.ok()) { + if (!should_retry_source_selection(upgraded_view_transport_or.status())) { + return upgraded_view_transport_or.status(); + } + ++resident_view_attempt; + VLOG(1) << "resident view route not ready after source-side prepare: artifact_id=" << artifact_id + << " view_id=" << *view_id << " attempt=" << resident_view_attempt + << " status=" << upgraded_view_transport_or.status(); + std::this_thread::sleep_for(kMinReselectionBackoff); + continue; + } + last_route_kind = static_cast(upgraded_view_transport_or->route_kind); + if (upgraded_view_transport_or->route_kind == tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_RESIDENT_VIEW) { + std::move(cleanup_guard).Cancel(); + return std::optional(UpgradedDerivedViewTransport{ + .session = std::move(*upgraded_view_transport_or), + .cleanup = std::move(cleanup), + }); + } + + absl::Status cancel_status = gs_client_->complete_replica_transport( + upgraded_view_transport_or->transport_id, + 
components::TransportCompletionOutcome::kCancelled, + "waiting_for_resident_view_after_source_prepare"); + if (!cancel_status.ok()) { + LOG(WARNING) << "complete_replica_transport while waiting for resident view returned error: " << cancel_status; + } + ++resident_view_attempt; + VLOG(1) << "resident view route still unresolved after source-side prepare: artifact_id=" << artifact_id + << " view_id=" << *view_id << " attempt=" << resident_view_attempt << " route_kind=" << last_route_kind; + std::this_thread::sleep_for(kMinReselectionBackoff); + } + }; + while (gs_connected && allow_p2p) { const std::chrono::milliseconds remaining = remaining_request_budget(request_deadline); if (remaining != std::chrono::milliseconds::max() && remaining < kMinReselectionBudget) { @@ -462,8 +911,65 @@ absl::StatusOr MaterializeOrchestrator::run( } break; } - const auto& session = *transport_or; + components::TransportSession session = *transport_or; + std::optional prepared_remote_cleanup; + std::optional remote_fetch_cleanup; + std::string remote_fetch_reason = "scope_exit"; + auto prepared_cleanup_guard = absl::MakeCleanup([&]() { + if (!prepared_remote_cleanup.has_value()) { + return; + } + const absl::Status release_status = release_prepared_remote_replica(*prepared_remote_cleanup); + if (!release_status.ok()) { + LOG(WARNING) << "best-effort release of derived view export failed after transport session: " << release_status; + } + }); + auto remote_fetch_cleanup_guard = absl::MakeCleanup([&]() { + if (!remote_fetch_cleanup.has_value()) { + return; + } + const absl::Status end_status = end_remote_replica_fetch(*remote_fetch_cleanup, remote_fetch_reason); + if (!end_status.ok()) { + LOG(WARNING) << "best-effort EndReplicaFetch failed after transport session: " << end_status; + } + }); + + if (view_id.has_value() && + session.route_kind == tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_DERIVED_VIEW_FROM_CANONICAL) { + auto upgraded_transport_or = 
try_upgrade_derived_view_transport(session); + if (!upgraded_transport_or.ok()) { + if (is_source_side_upgrade_terminal_timeout(upgraded_transport_or.status())) { + LOG(WARNING) << "event=source_side_upgrade_deadline_exhausted" + << " artifact_id=" << artifact_id << " view_id=" << *view_id << " route_kind=terminal_timeout" + << " terminal_reason=" << upgraded_transport_or.status(); + return upgraded_transport_or.status(); + } + used_canonical_transport_fallback = true; + record_view_transport_fallback("source_side_upgrade_unavailable", "upgrade_to_resident_view", true); + LOG(INFO) << "source-side derived view transport upgrade unavailable for artifact_id=" << artifact_id + << " view_id=" << *view_id << " route_kind=canonical_fallback" + << " fallback_reason=" << upgraded_transport_or.status() << "; continuing with canonical transport"; + } else if (upgraded_transport_or->has_value()) { + absl::Status cancel_status = gs_client_->complete_replica_transport( + session.transport_id, + components::TransportCompletionOutcome::kCancelled, + "replaced_by_resident_view_transport"); + if (!cancel_status.ok()) { + LOG(WARNING) << "complete_replica_transport for superseded canonical route returned error: " << cancel_status; + } + prepared_remote_cleanup = std::move((*upgraded_transport_or)->cleanup); + session = std::move((*upgraded_transport_or)->session); + } + } const auto& remote = session.remote_replica; + const std::string route_kind = route_kind_to_string(session.route_kind); + record_route_selected(session.route_kind, view_id.has_value(), used_canonical_transport_fallback); + LOG(INFO) << "event=route_selected" + << " artifact_id=" << artifact_id << " view_id=" << (view_id.has_value() ? 
*view_id : std::string()) + << " target_device=" << target_device.ordinal << " route_kind=" << route_kind + << " transport_id=" << session.transport_id << " remote_node=" << remote.node_address + << " remote_port=" << remote.node_port << " remote_grpc_port=" << remote.grpc_port + << " canonical_fallback=" << (used_canonical_transport_fallback ? "true" : "false"); if (is_local_replica(remote, local_identity_)) { record_stale_source_detected("local_route", view_id.has_value(), reselection_attempt); @@ -505,6 +1011,7 @@ absl::StatusOr MaterializeOrchestrator::run( p2p_src.buf_sizes = remote.buffer_sizes; p2p_src.verification_json = remote.verification_json; p2p_src.enable_checksum = true; + p2p_src.source_is_view = session.route_kind == tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_RESIDENT_VIEW; p2p_src.location.type = remote.memory_type; p2p_src.location.device_id = remote.device_id; if (has_disk_source && allow_disk && preference != loading::SourcePreference::kPreferP2P) { @@ -517,7 +1024,30 @@ absl::StatusOr MaterializeOrchestrator::run( : common::memory::MemoryLocation::CPU; target.location.device_id = target_device.ordinal; + if (view_id.has_value() && session.route_kind == tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_RESIDENT_VIEW && + !is_local_replica(remote, local_identity_)) { + auto begin_fetch_or = begin_remote_replica_fetch(session, artifact_id, *view_id); + if (!begin_fetch_or.ok()) { + LOG(WARNING) << "source-side fetch lifecycle unavailable for artifact_id=" << artifact_id + << " view_id=" << *view_id << " transport_id=" << session.transport_id << ": " + << begin_fetch_or.status(); + } else if (*begin_fetch_or) { + remote_fetch_cleanup = RemoteReplicaFetchCleanup{ + .daemon_address = absl::StrCat(remote.node_address, ":", remote.grpc_port), + .transport_id = session.transport_id, + }; + } + } + auto load_or = backend_->ingest_from_p2p(std::string(artifact_id), p2p_src, target, hints); + remote_fetch_reason = load_or.ok() ? 
"ingest_success" : "ingest_failure"; + if (remote_fetch_cleanup.has_value()) { + const absl::Status end_status = end_remote_replica_fetch(*remote_fetch_cleanup, remote_fetch_reason); + if (!end_status.ok()) { + LOG(WARNING) << "EndReplicaFetch after ingest_from_p2p failed: " << end_status; + } + remote_fetch_cleanup.reset(); + } if (load_or.ok()) { // Notify GS that transport finished absl::Status comp_status = gs_client_->complete_replica_transport( @@ -532,8 +1062,10 @@ absl::StatusOr MaterializeOrchestrator::run( } if (used_canonical_transport_fallback && view_id.has_value()) { - LOG(INFO) << "materialize_view loaded via canonical transport fallback: artifact_id=" << artifact_id - << " view_id=" << *view_id; + LOG(INFO) << "event=view_materialized_via_fallback" + << " route_kind=canonical_fallback" + << " artifact_id=" << artifact_id << " view_id=" << *view_id + << " transport_id=" << session.transport_id; } if (reselection_attempt > 0) { record_source_reselection_success("p2p_load", view_id.has_value(), reselection_attempt); diff --git a/core/store/materialization/dataplane/runtime/pump.cc b/core/store/materialization/dataplane/runtime/pump.cc index 68f12c302..1d4c3e2bd 100644 --- a/core/store/materialization/dataplane/runtime/pump.cc +++ b/core/store/materialization/dataplane/runtime/pump.cc @@ -4,11 +4,20 @@ #include #include +#include #include #include +#include +#include +#include #include #include +#include +#include +#include +#include + #include "absl/base/thread_annotations.h" #include "absl/cleanup/cleanup.h" #include "absl/log/absl_check.h" @@ -30,6 +39,26 @@ namespace tensorcast::store::loader { namespace { +pid_t current_tid() { + return static_cast(::syscall(SYS_gettid)); +} + +int current_cpu() { + return ::sched_getcpu(); +} + +int addr_numa_node(const void* addr) { + if (addr == nullptr) { + return -1; + } + int node = -1; + long rc = ::syscall(SYS_get_mempolicy, &node, nullptr, 0, const_cast(addr), MPOL_F_NODE | MPOL_F_ADDR); + if (rc != 0) { + 
return -1; + } + return node; +} + struct PumpState { std::atomic should_stop{false}; std::atomic drain_requested{false}; @@ -93,7 +122,7 @@ class SlotLease { }; static void run_consumer(PositionedSink& dst, BufferPool& pool, PumpState& state) { - VLOG(1) << "Consumer thread started"; + VLOG(1) << "Consumer thread started tid=" << current_tid() << " cpu=" << current_cpu(); bool draining = false; struct AsyncSlot { @@ -333,6 +362,7 @@ void run_range_producer( std::atomic& range_index, PumpState& state) { const absl::Time producer_start = absl::Now(); + VLOG(2) << "pump_producer_start tid=" << current_tid() << " cpu=" << current_cpu(); uint64_t produced_chunks = 0; uint64_t produced_bytes = 0; uint64_t wait_free_chunk_us_total = 0; @@ -446,14 +476,16 @@ void run_range_producer( VLOG(2) << "pump_producer_chunk range_index=" << idx << " chunk_id=" << chunk_id << " slot=" << slot_id << " src_offset=" << (current_offset - bytes_read) << " bytes=" << bytes_read << " wait_free_chunk_us=" << wait_free_us << " read_at_us=" << read_us - << " mark_chunk_ready_us=" << mark_ready_us << " remaining_in_range=" << remaining; + << " mark_chunk_ready_us=" << mark_ready_us << " remaining_in_range=" << remaining + << " tid=" << current_tid() << " cpu=" << current_cpu() << " buffer_numa=" << addr_numa_node(buffer); } } VLOG(2) << "pump_producer_summary ranges=" << ranges.size() << " produced_chunks=" << produced_chunks << " produced_bytes=" << produced_bytes << " wait_free_chunk_us_total=" << wait_free_chunk_us_total << " read_at_us_total=" << read_at_us_total << " mark_chunk_ready_us_total=" << mark_ready_us_total << " duration_us=" - << static_cast(std::max(0, absl::ToInt64Microseconds(absl::Now() - producer_start))); + << static_cast(std::max(0, absl::ToInt64Microseconds(absl::Now() - producer_start))) + << " tid=" << current_tid() << " cpu=" << current_cpu(); } } // namespace diff --git a/core/store/materialization/dataplane/sources/byte_range_mapped_source.cc 
b/core/store/materialization/dataplane/sources/byte_range_mapped_source.cc index f524cdfa5..f39358419 100644 --- a/core/store/materialization/dataplane/sources/byte_range_mapped_source.cc +++ b/core/store/materialization/dataplane/sources/byte_range_mapped_source.cc @@ -2,6 +2,10 @@ #include "core/store/materialization/dataplane/sources/byte_range_mapped_source.h" +#include +#include +#include +#include #include #include #include @@ -25,6 +29,26 @@ namespace tensorcast::store::loader { namespace { +pid_t current_tid() { + return static_cast(::syscall(SYS_gettid)); +} + +int current_cpu() { + return ::sched_getcpu(); +} + +int addr_numa_node(const void* addr) { + if (addr == nullptr) { + return -1; + } + int node = -1; + long rc = ::syscall(SYS_get_mempolicy, &node, nullptr, 0, const_cast(addr), MPOL_F_NODE | MPOL_F_ADDR); + if (rc != 0) { + return -1; + } + return node; +} + struct StridedBlock { size_t run_index{0}; uint64_t first_row{0}; @@ -295,11 +319,12 @@ absl::StatusOr ByteRangeMappedSource::read_base( stats_.base_read_bytes.fetch_add(*read_or, std::memory_order_relaxed); if (*read_or != bytes) { VLOG(2) << "byte_range.read_base short_read source_index=" << source_index << " offset=" << offset - << " requested=" << bytes << " got=" << *read_or << " duration_us=" << elapsed_us(start); + << " requested=" << bytes << " got=" << *read_or << " duration_us=" << elapsed_us(start) + << " tid=" << current_tid() << " cpu=" << current_cpu(); return absl::DataLossError("short read while executing byte range program"); } VLOG(2) << "byte_range.read_base ok source_index=" << source_index << " offset=" << offset << " bytes=" << bytes - << " duration_us=" << elapsed_us(start); + << " duration_us=" << elapsed_us(start) << " tid=" << current_tid() << " cpu=" << current_cpu(); return *read_or; } @@ -493,10 +518,15 @@ absl::StatusOr ByteRangeMappedSource::fill_strided_run( const uint64_t rows_touched = last_row_exclusive > first_row ? 
(last_row_exclusive - first_row) : 0; auto& source = sources_[run.source_index]; const uint8_t* cpu_source_base_ptr = source->cpu_base_ptr(); + const uint64_t first_src_offset = run.src_base + first_row * run.stride + (run_offset % run.row_len); VLOG(2) << "byte_range.strided.direct_gather_probe run_index=" << run_index << " source_index=" << run.source_index << " source_type=" << typeid(*source).name() << " has_cpu_base_ptr=" << (cpu_source_base_ptr != nullptr ? 1 : 0) << " row_len=" << run.row_len - << " stride=" << run.stride << " bytes=" << bytes << " rows_touched=" << rows_touched; + << " stride=" << run.stride << " bytes=" << bytes << " rows_touched=" << rows_touched + << " tid=" << current_tid() << " cpu=" << current_cpu() + << " source_base_numa=" << addr_numa_node(cpu_source_base_ptr) << " first_src_addr_numa=" + << addr_numa_node(cpu_source_base_ptr != nullptr ? cpu_source_base_ptr + first_src_offset : nullptr) + << " dst_numa=" << addr_numa_node(dst); const bool direct_gather_candidate = run.stride > run.row_len && run.row_len >= kDirectGatherMinRowLenBytes && bytes >= kDirectGatherMinTotalBytes && rows_touched <= kDirectGatherMaxRowsTouched && cpu_source_base_ptr != nullptr; @@ -542,7 +572,10 @@ absl::StatusOr ByteRangeMappedSource::fill_strided_run( stats_.base_read_bytes.fetch_add(copied, std::memory_order_relaxed); VLOG(2) << "byte_range.strided.direct_gather run_index=" << run_index << " source_index=" << run.source_index << " rows_touched=" << rows_touched << " bytes=" << copied << " memcpy_calls=" << direct_gather_memcpy_calls - << " duration_us=" << gather_us; + << " duration_us=" << gather_us << " tid=" << current_tid() << " cpu=" << current_cpu() + << " source_base_numa=" << addr_numa_node(cpu_source_base_ptr) << " first_src_addr_numa=" + << addr_numa_node(cpu_source_base_ptr != nullptr ? 
cpu_source_base_ptr + first_src_offset : nullptr) + << " dst_numa=" << addr_numa_node(dst); flush_local_stats(); return copied; } diff --git a/core/store/materialization/runtime/pipeline/allocation_stage.cc b/core/store/materialization/runtime/pipeline/allocation_stage.cc index d2f4d4a9e..fc8fe707e 100644 --- a/core/store/materialization/runtime/pipeline/allocation_stage.cc +++ b/core/store/materialization/runtime/pipeline/allocation_stage.cc @@ -78,6 +78,7 @@ absl::StatusOr build_replica_config(IngestionContext& ct config.source = ctx.p2p.source; config.local_device_id = ctx.target_device_id; config.p2p_comm_enabled = true; + config.source_is_view = ctx.p2p.source.source_is_view; } return config; diff --git a/core/store/materialization/runtime/pipeline/verification_stage.cc b/core/store/materialization/runtime/pipeline/verification_stage.cc index f3244b1e2..1f00989d5 100644 --- a/core/store/materialization/runtime/pipeline/verification_stage.cc +++ b/core/store/materialization/runtime/pipeline/verification_stage.cc @@ -268,14 +268,27 @@ absl::Status verify_disk(IngestionContext& ctx) { absl::Status verify_p2p(IngestionContext& ctx) { const auto& source = ctx.p2p.source; + const bool is_view_request = ctx.resolved_view_plan.has_value() && !ctx.resolved_view_plan->is_identity && + ctx.resolved_view_plan->view_size_bytes > 0; + const uint64_t expected_view_size = is_view_request ? 
ctx.resolved_view_plan->view_size_bytes : 0; if (!source.verification_json.empty()) { auto info_or = common::ArtifactVerificationInfo::from_json(source.verification_json); if (!info_or.ok()) { return absl::DataLossError("verification_json parse failed"); } - auto verify_status = ctx.replica->verify_key_points(ctx.target_location, *info_or); - if (!verify_status.ok()) { - return absl::DataLossError(std::string(verify_status.message())); + if (is_view_request && info_or->artifact_size != expected_view_size) { + LOG(INFO) << "verify_p2p: skipping incompatible canonical verification metadata for view materialization" + << " artifact_id=" << ctx.artifact_identifier << " expected_view_size=" << expected_view_size + << " verification_size=" << info_or->artifact_size << " view_id=" + << (ctx.hints.variant.has_value() && ctx.hints.variant->view_id.has_value() + ? *ctx.hints.variant->view_id + : std::string("")) + << " need_view_data_hash=" << ctx.hints.need_view_data_hash; + } else { + auto verify_status = ctx.replica->verify_key_points(ctx.target_location, *info_or); + if (!verify_status.ok()) { + return absl::DataLossError(std::string(verify_status.message())); + } } } diff --git a/core/store/replica/replica.cc b/core/store/replica/replica.cc index 956fb32a6..7a074347c 100644 --- a/core/store/replica/replica.cc +++ b/core/store/replica/replica.cc @@ -77,6 +77,7 @@ absl::StatusOr> Replica::create(ReplicaConfig config) { // --- Create Loader based on Source --- std::unique_ptr loader; common::memory::MemoryLocation source_type = common::memory::MemoryLocation::NONE; + bool source_is_view = false; absl::Status visitor_status = absl::OkStatus(); // Initialize status @@ -95,6 +96,7 @@ absl::StatusOr> Replica::create(ReplicaConfig config) { << p2p_source.port; loader = std::make_unique(p2p_source); source_type = common::memory::MemoryLocation::REMOTE; + source_is_view = p2p_source.source_is_view; return absl::OkStatus(); // Return OK status }, [&](const 
loading::InlineBufferSource& buffer_source) -> absl::Status { @@ -208,6 +210,7 @@ absl::StatusOr> Replica::create(ReplicaConfig config) { std::move(memory_manager), config.async_runtime, source_type, + source_is_view || config.source_is_view, std::move(view_plan), std::move(config.canonical_index_json), std::move(config.source_index_json), @@ -226,6 +229,7 @@ Replica::Replica( std::shared_ptr memory_manager, gsl::not_null> async_runtime, common::memory::MemoryLocation source_type, + bool source_is_view, std::optional view_plan, std::optional canonical_index_json, std::optional source_index_json, @@ -236,6 +240,7 @@ Replica::Replica( memory_manager_(std::move(memory_manager)), async_runtime_(std::move(async_runtime)), original_source_type_(source_type), + source_is_view_(source_is_view), view_plan_(std::move(view_plan)), canonical_index_json_(std::move(canonical_index_json)), source_index_json_(std::move(source_index_json)), @@ -381,8 +386,9 @@ folly::SemiFuture Replica::ensure_loaded_async( return ready_signal->subscribe(); } std::unique_ptr source_ptr = std::move(*src_or); + const bool source_is_view = source_location == MemoryLocation::REMOTE && source_is_view_; bool composed_view = false; - if (canonical_index_json_.has_value() && source_index_json_.has_value()) { + if (!source_is_view && canonical_index_json_.has_value() && source_index_json_.has_value()) { auto canonical_total_size = compute_total_size_from_index(*canonical_index_json_); if (!canonical_total_size.has_value()) { ready_signal->set_value(absl::FailedPreconditionError("canonical index total_size is unavailable")); @@ -424,13 +430,14 @@ folly::SemiFuture Replica::ensure_loaded_async( } source_ptr = std::move(*mapped_or); } - if (view_plan_.has_value() && !view_plan_->is_identity) { + if (!source_is_view && view_plan_.has_value() && !view_plan_->is_identity) { if (!composed_view) { source_ptr = loader::make_view_plan_source(std::move(source_ptr), view_plan_->selection, byte_mapping_config_); } } 
std::function post_load_fn; - if (view_plan_.has_value() && !view_plan_->is_identity && view_plan_->transform.requires_materialization && + if (!source_is_view && view_plan_.has_value() && !view_plan_->is_identity && + view_plan_->transform.requires_materialization && transform_placement_ == loading::TransformPlacement::kServer) { loader::TransformPlan transform_plan = view_plan_->transform; auto mm_shared = memory_manager_; diff --git a/core/store/replica/replica.h b/core/store/replica/replica.h index 4acfafe54..0c9034141 100644 --- a/core/store/replica/replica.h +++ b/core/store/replica/replica.h @@ -222,6 +222,7 @@ class Replica { std::shared_ptr memory_manager, gsl::not_null> async_runtime, common::memory::MemoryLocation source_type, + bool source_is_view, std::optional view_plan, std::optional canonical_index_json, std::optional source_index_json, @@ -240,6 +241,7 @@ class Replica { // Store the original source type for reference (e.g., to know if RDMA registration makes sense) const common::memory::MemoryLocation original_source_type_; + const bool source_is_view_; // Optional view execution plan when this replica represents a variant byte space. 
const std::optional view_plan_; diff --git a/core/store/replica/replica_config.h b/core/store/replica/replica_config.h index 0960c9aa0..36d085cea 100644 --- a/core/store/replica/replica_config.h +++ b/core/store/replica/replica_config.h @@ -64,6 +64,7 @@ struct ReplicaConfig { // Whether to enable P2P communication for this replica bool p2p_comm_enabled = false; + bool source_is_view = false; // Future runtime configurations can be added here: // - Variant residency metadata (view identifiers) diff --git a/core/store/runtime/ingestion/materialization_facade.cc b/core/store/runtime/ingestion/materialization_facade.cc index 2ee453bf9..62a3e23f6 100644 --- a/core/store/runtime/ingestion/materialization_facade.cc +++ b/core/store/runtime/ingestion/materialization_facade.cc @@ -59,6 +59,8 @@ namespace { constexpr uint32_t kDefaultTransportWaitTimeoutMs = 30000; constexpr uint32_t kViewTransportProbeTimeoutMs = 1000; +enum class MappedSourceByteSpace : uint8_t { kCanonical = 0, kView = 1 }; + bool is_local_identity(const components::WorkerIdentity& local) { return !local.node_id.empty() || !local.node_address.empty(); } @@ -114,6 +116,23 @@ uint32_t resolve_view_transport_probe_timeout_ms(uint32_t wait_timeout_ms) { return std::min(wait_timeout_ms, kViewTransportProbeTimeoutMs); } +loader::ByteRangeMap build_identity_map(uint64_t bytes) { + loader::ByteRangeMap map; + map.total_bytes = bytes; + map.num_sources = 1; + if (bytes > 0) { + map.segments.push_back( + loader::ByteRangeSegment{ + .kind = loader::ByteRangeSegment::Kind::kData, + .dst_offset = 0, + .length = bytes, + .src_offset = 0, + .source_index = 0, + }); + } + return map; +} + std::string make_materialize_into_target_pinned_wait_context( const loading::MaterializeHints& hints, int target_device_ordinal, @@ -1035,11 +1054,42 @@ absl::StatusOr MaterializationFacade::materialize_view_f return handle; }; + auto maybe_publish_local_view_completion = [&]() { + if (request.hints().export_policy == 
loading::ExportPolicy::kNever) { + return; + } + const auto publish_state = config_.replica_runtime->get_replica_publish_state(key); + if (publish_state == ReplicaPublishState::kPublishPending || publish_state == ReplicaPublishState::kPublished) { + return; + } + IngestionCompletedEvent event; + event.request_id = make_request_id("local_view"); + event.source = IngestionSource::kP2P; + event.materialize_mode = request.mode(); + event.export_policy = request.hints().export_policy; + event.artifact_id = key.artifact_id; + event.target_device = request.target_device(); + event.target_location = request.target_location(); + event.bytes_transferred = local_map.total_bytes; + event.duration_seconds = 0.0; + event.status = absl::OkStatus(); + event.replica_key = key; + event.view_id = request.requested_view_id(); + event.publish_to_global_store = true; + event.publish_context_id = config_.runtime_context->mint_publish_context_id(); + publish_completed_event(std::move(event)); + // Source-side prepare must not return before the resident-view replica is visible + // to subsequent route selection on the requesting worker. The ingestion event hub + // dispatches asynchronously, so drain here to close that publication race. 
+ config_.runtime_context->drain_events(); + }; + auto existing_or = replica_registry.find(key); if (existing_or.ok()) { const auto& existing = existing_or.value(); absl::Status reuse_status = validate_existing_replica_for_reuse(existing, request.target_location()); if (reuse_status.ok()) { + maybe_publish_local_view_completion(); LOG(INFO) << "materialize_view.local_canonical_reuse artifact_id=" << request.canonical_artifact_id() << " view_id=" << view_id << " source_key=" << *selected_key; return build_local_view_handle(existing); @@ -1065,6 +1115,7 @@ absl::StatusOr MaterializationFacade::materialize_view_f if (!rebuild_status.ok()) { return rebuild_status; } + maybe_publish_local_view_completion(); LOG(INFO) << "materialize_view.local_canonical_rebuild artifact_id=" << request.canonical_artifact_id() << " view_id=" << view_id << " source_key=" << *selected_key; return build_local_view_handle(existing); @@ -1127,6 +1178,7 @@ absl::StatusOr MaterializationFacade::materialize_view_f if (!reuse_status.ok()) { return reuse_status; } + maybe_publish_local_view_completion(); LOG(INFO) << "materialize_view.local_canonical_raced artifact_id=" << request.canonical_artifact_id() << " view_id=" << view_id << " source_key=" << *selected_key; return build_local_view_handle(concurrent_replica); @@ -1135,6 +1187,7 @@ absl::StatusOr MaterializationFacade::materialize_view_f return emplace_status; } + maybe_publish_local_view_completion(); LOG(INFO) << "materialize_view.local_canonical_loaded artifact_id=" << request.canonical_artifact_id() << " view_id=" << view_id << " source_key=" << *selected_key << " source_location=" << static_cast(selected_location) << " view_bytes=" << local_map.total_bytes; @@ -1373,7 +1426,8 @@ absl::StatusOr MaterializationFacade::mate auto run_source = [&](std::unique_ptr loader, - loading::MaterializationSource source_kind) -> absl::StatusOr { + loading::MaterializationSource source_kind, + MappedSourceByteSpace source_byte_space) -> absl::StatusOr 
{ auto init_status = loader->initialize(); if (!init_status.ok()) { return init_status; @@ -1383,18 +1437,28 @@ absl::StatusOr MaterializationFacade::mate return source_or.status(); } + const bool source_is_view = source_byte_space == MappedSourceByteSpace::kView; const bool use_source_layout = - (source_kind == loading::MaterializationSource::kDisk) && source_index_json.has_value(); - auto map_ptr_or = get_map_ptr(use_source_layout); - if (!map_ptr_or.ok()) { - return map_ptr_or.status(); + !source_is_view && (source_kind == loading::MaterializationSource::kDisk) && source_index_json.has_value(); + + loader::ByteRangeMap effective_map; + if (source_is_view) { + effective_map = build_identity_map(total_size); + } else { + auto map_ptr_or = get_map_ptr(use_source_layout); + if (!map_ptr_or.ok()) { + return map_ptr_or.status(); + } + effective_map = **map_ptr_or; } - auto map_ptr = *map_ptr_or; - loader::ByteRangeMap effective_map = *map_ptr; bool composed_view = false; - if (view_plan.has_value() && !view_plan->is_identity && use_source_layout) { - auto composed_or = loader::compose_byte_range_maps(view_plan->selection.map, *map_ptr); + if (!source_is_view && view_plan.has_value() && !view_plan->is_identity && use_source_layout) { + auto map_ptr_or = get_map_ptr(true); + if (!map_ptr_or.ok()) { + return map_ptr_or.status(); + } + auto composed_or = loader::compose_byte_range_maps(view_plan->selection.map, **map_ptr_or); if (!composed_or.ok()) { return composed_or.status(); } @@ -1424,7 +1488,7 @@ absl::StatusOr MaterializationFacade::mate return mapped_or.status(); } plan_source = std::move(*mapped_or); - if (view_plan.has_value() && !view_plan->is_identity && !composed_view) { + if (!source_is_view && view_plan.has_value() && !view_plan->is_identity && !composed_view) { plan_source = loader::make_view_plan_source(std::move(plan_source), view_plan->selection, config_.options->byte_mapping); } @@ -1560,7 +1624,10 @@ absl::StatusOr MaterializationFacade::mate if 
(prefer_disk && has_disk_source && allow_disk) { loading::DiskSource disk_src = *disk_source; disk_src.require_descriptor = tensorcast::common::is_mi2_artifact_id(hints.artifact_id); - auto disk_or = run_source(std::make_unique(disk_src), loading::MaterializationSource::kDisk); + auto disk_or = run_source( + std::make_unique(disk_src), + loading::MaterializationSource::kDisk, + MappedSourceByteSpace::kCanonical); if (disk_or.ok()) { return disk_or; } @@ -1618,6 +1685,7 @@ absl::StatusOr MaterializationFacade::mate p2p_src.buf_sizes = remote.buffer_sizes; p2p_src.verification_json = remote.verification_json; p2p_src.enable_checksum = false; + p2p_src.source_is_view = session.route_kind == tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_RESIDENT_VIEW; p2p_src.location.type = remote.memory_type; p2p_src.location.device_id = remote.device_id; p2p_src.request_budget = hints.request_budget; @@ -1625,7 +1693,10 @@ absl::StatusOr MaterializationFacade::mate if (has_disk_source && allow_disk && !prefer_p2p) { p2p_src.fallback_disk_dir = disk_source->path.string(); } - auto p2p_or = run_source(std::make_unique(p2p_src), loading::MaterializationSource::kP2P); + auto p2p_or = run_source( + std::make_unique(p2p_src), + loading::MaterializationSource::kP2P, + p2p_src.source_is_view ? MappedSourceByteSpace::kView : MappedSourceByteSpace::kCanonical); auto complete_status = gs_client->complete_replica_transport( session.transport_id, p2p_or.ok() ? 
components::TransportCompletionOutcome::kSuccess @@ -1649,7 +1720,10 @@ absl::StatusOr MaterializationFacade::mate if (allow_disk && has_disk_source) { loading::DiskSource disk_src = *disk_source; disk_src.require_descriptor = tensorcast::common::is_mi2_artifact_id(hints.artifact_id); - return run_source(std::make_unique(disk_src), loading::MaterializationSource::kDisk); + return run_source( + std::make_unique(disk_src), + loading::MaterializationSource::kDisk, + MappedSourceByteSpace::kCanonical); } if (!allow_p2p && !allow_disk) { @@ -1734,25 +1808,6 @@ absl::StatusOr MaterializationFacade::mate ? std::optional(*hints.disk_metadata->source_index_json) : std::nullopt; - enum class MappedSourceByteSpace : uint8_t { kCanonical = 0, kView = 1 }; - - auto build_identity_map = [](uint64_t bytes) -> loader::ByteRangeMap { - loader::ByteRangeMap map; - map.total_bytes = bytes; - map.num_sources = 1; - if (bytes > 0) { - map.segments.push_back( - loader::ByteRangeSegment{ - .kind = loader::ByteRangeSegment::Kind::kData, - .dst_offset = 0, - .length = bytes, - .src_offset = 0, - .source_index = 0, - }); - } - return map; - }; - auto get_map_ptr = [&](bool use_source_layout) -> absl::StatusOr> { const std::string canonical_hash = index_hash(canonical_index_json); std::string plan_key = absl::StrCat(generation, ":canon:", canonical_hash); @@ -1804,8 +1859,10 @@ absl::StatusOr MaterializationFacade::mate const bool use_source_layout = !source_is_view && (source_kind == loading::MaterializationSource::kDisk) && source_index_json.has_value(); loader::ByteRangeMap effective_map = mapping; - if (source_is_view && hints.variant && hints.variant->view_id.has_value() && !view_plan.has_value()) { - // Opaque mapped view_id routes already expose destination byte space layout. + if (source_is_view) { + // Resident-view transport already streams bytes in destination/view byte space. 
+ // Reapplying the view selection on the receiver would double-transform the payload + // and can drive the mapped source out of bounds. effective_map = build_identity_map(total_size); } if (!source_is_view && view_plan.has_value() && !view_plan->is_identity) { @@ -3212,7 +3269,8 @@ absl::StatusOr MaterializationFacade::ingest_from_disk( const loading::DiskSource& source, const loading::ReplicaTarget& target, const loading::MaterializeHints& hints) { - return run_disk_ingestion_internal(artifact_identifier, source, target, hints, /*publish_to_global_store=*/false); + const bool publish_to_global_store = hints.export_policy != loading::ExportPolicy::kNever; + return run_disk_ingestion_internal(artifact_identifier, source, target, hints, publish_to_global_store); } absl::StatusOr MaterializationFacade::ingest_from_disk( @@ -3229,7 +3287,8 @@ absl::StatusOr MaterializationFacade::ingest_from_p2p( const P2PSource& source, const loading::ReplicaTarget& target, const loading::MaterializeHints& hints) { - return run_p2p_ingestion_internal(artifact_identifier, source, target, hints, /*publish_to_global_store=*/false); + const bool publish_to_global_store = hints.export_policy != loading::ExportPolicy::kNever; + return run_p2p_ingestion_internal(artifact_identifier, source, target, hints, publish_to_global_store); } absl::StatusOr MaterializationFacade::ingest_from_p2p( diff --git a/core/store/testing/recording_global_store_client.h b/core/store/testing/recording_global_store_client.h index 38651e01b..8ba7a6cd4 100644 --- a/core/store/testing/recording_global_store_client.h +++ b/core/store/testing/recording_global_store_client.h @@ -631,12 +631,14 @@ class RecordingGlobalStoreClient final : public components::IGlobalStoreClient { session.remote_replica.node_id = "stub-node"; session.remote_replica.node_address = "127.0.0.1"; session.remote_replica.node_port = 12345; + session.remote_replica.grpc_port = 12346; session.remote_replica.memory_size = 16; 
session.remote_replica.memory_type = common::memory::MemoryLocation::CPU; session.remote_replica.device_id = target_device.ordinal; session.remote_replica.remote_memory_keys = {"tensor.data_0"}; session.remote_replica.buffer_sizes = {16}; session.remote_replica.verification_json = "{}"; + session.route_kind = tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_CANONICAL; return session; } @@ -648,12 +650,19 @@ class RecordingGlobalStoreClient final : public components::IGlobalStoreClient { session.remote_replica.node_id = remote_node_id; session.remote_replica.node_address = remote_node_address; session.remote_replica.node_port = remote_node_port; + session.remote_replica.grpc_port = remote_node_port + 1; session.remote_replica.memory_size = info.memory_size; session.remote_replica.memory_type = info.memory_type; session.remote_replica.device_id = info.device_id; session.remote_replica.remote_memory_keys = info.remote_memory_keys; session.remote_replica.buffer_sizes = info.buffer_sizes; session.remote_replica.verification_json = "{}"; + session.route_kind = info.view_id.has_value() ? 
tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_RESIDENT_VIEW + : tensorcast::global_store::v1::TRANSPORT_ROUTE_KIND_CANONICAL; + if (info.view_id.has_value()) { + session.view_transport_metadata = components::ViewTransportMetadata{ + .view_id = *info.view_id, .view_size_bytes = info.memory_size, .view_data_hash = std::nullopt}; + } return session; } }; diff --git a/daemon/BUILD b/daemon/BUILD index 2f6bcc01a..0f773aeb6 100644 --- a/daemon/BUILD +++ b/daemon/BUILD @@ -594,6 +594,22 @@ sc_cc_library( ], ) +sc_cc_library( + name = "derived_view_export_manager_lib", + srcs = ["state/derived_view_export_manager.cc"], + hdrs = ["state/derived_view_export_manager.h"], + deps = [ + ":session_lifecycle_lib", + ":types_hdr", + "//core/store:global_store_client", + "//core/store:store_engine", + "//core/store/materialization/contracts:loading_spec", + "@abseil-cpp//absl/container:flat_hash_map", + "@abseil-cpp//absl/synchronization", + "@abseil-cpp//absl/time", + ], +) + sc_cc_library( name = "session_lifecycle_lib", srcs = ["state/session_lifecycle.cc"], @@ -658,6 +674,7 @@ sc_cc_library( ":artifact_source_registry_lib", ":background_scheduler_hdr", ":daemon_options_hdr", + ":derived_view_export_manager_lib", ":device_resolver_hdr", ":handle_lease_registry_lib", ":ipc_region_registry_lib", @@ -834,6 +851,7 @@ sc_cc_library( ":artifact_source_registry_lib", ":assembly_operation_service_lib", ":daemon_options_hdr", + ":derived_view_export_manager_lib", ":device_resolver_hdr", ":disk_artifact_service_lib", ":handle_lease_registry_lib", @@ -911,6 +929,7 @@ sc_cc_library( deps = [ ":artifact_source_registry_lib", ":daemon_options_hdr", + ":derived_view_export_manager_lib", ":device_resolver_hdr", ":grpc_peer_utils_lib", ":handle_lease_registry_lib", @@ -1317,6 +1336,7 @@ sc_cc_library( srcs = ["service/controllers/transport_controller.cc"], hdrs = ["service/controllers/transport_controller.h"], deps = [ + ":derived_view_export_manager_lib", ":lip_manager_lib", 
":rpc_context_hdr", ":status_utils", @@ -1363,6 +1383,38 @@ cc_test( ], ) +cc_test( + name = "derived_view_export_manager_test", + srcs = ["state/derived_view_export_manager_test.cc"], + deps = [ + ":derived_view_export_manager_lib", + ":lip_manager_lib", + ":ref_tracker_hdr", + ":session_lifecycle_lib", + "//core/store:device_registry", + "//core/store:store_engine", + "//core/store:testing_global_store_client_stub", + "//core/store/materialization/dataplane:view_identity", + "@catch2//:catch2_main", + ], +) + +cc_test( + name = "derived_view_export_manager_eviction_test", + srcs = ["state/derived_view_export_manager_eviction_test.cc"], + deps = [ + ":derived_view_export_manager_lib", + ":lip_manager_lib", + ":ref_tracker_hdr", + ":session_lifecycle_lib", + "//core/store:device_registry", + "//core/store:store_engine", + "//core/store:testing_global_store_client_stub", + "//core/store/materialization/dataplane:view_identity", + "@catch2//:catch2_main", + ], +) + cc_test( name = "session_lifecycle_immediate_unload_test", srcs = ["state/session_lifecycle_immediate_unload_test.cc"], diff --git a/daemon/app/daemon_app.cc b/daemon/app/daemon_app.cc index eef35af95..3dc6cb61e 100644 --- a/daemon/app/daemon_app.cc +++ b/daemon/app/daemon_app.cc @@ -164,6 +164,7 @@ absl::StatusOr> DaemonApp::create(Options options) { } if (app->options_.global_store_client) { app->kernel_->lip_manager().set_global_store_client(app->options_.global_store_client); + app->kernel_->derived_view_export_manager().set_global_store_client(app->options_.global_store_client); } MaterializationController::Dep mdep{ @@ -181,6 +182,7 @@ absl::StatusOr> DaemonApp::create(Options options) { .global_store_client = app->options_.global_store_client, .max_concurrency = app->options_.daemon_options.max_concurrency, .lifecycle = &app->kernel_->lifecycle_manager(), + .derived_view_exports = &app->kernel_->derived_view_export_manager(), .handle_leases = app->kernel_->handle_leases(), .capability_tokens = 
app->kernel_->capability_tokens(), .cpu_shared_memory_enabled = app->options_.daemon_options.cpu_shared_memory_enabled, @@ -207,7 +209,9 @@ absl::StatusOr> DaemonApp::create(Options options) { TransportController::Dep tdep{ .engine = app->kernel_->engine(), .locks = app->kernel_->transport_lock_manager(), - .lip = app->kernel_->lip_manager()}; + .lip = app->kernel_->lip_manager(), + .derived_view_exports = &app->kernel_->derived_view_export_manager(), + }; app->transport_controller_ = std::make_unique(tdep); StatusController::Dep sdep{ diff --git a/daemon/app/server_main.cc b/daemon/app/server_main.cc index 1cd970adc..a36a402cf 100644 --- a/daemon/app/server_main.cc +++ b/daemon/app/server_main.cc @@ -570,6 +570,8 @@ int main(int argc, char** argv) { cc.slice_bytes = cls.slice_bytes(); cc.pool_bytes = cls.pool_bytes(); cc.rdma_preregister = cls.rdma_preregister(); + cc.numa_node = cls.numa_node(); + cc.numa_prefault = cls.numa_prefault(); pinned_total_bytes += cls.pool_bytes(); pm_cfg.classes.push_back(std::move(cc)); } @@ -999,6 +1001,15 @@ int main(int argc, char** argv) { daemon_opts.handle_lease_ttl = duration_to_millis(cfg.lifecycle().handle_leases().ttl()); } daemon_opts.handle_lease_max_mints_per_second = cfg.lifecycle().handle_leases().max_mints_per_second(); + if (cfg.lifecycle().has_derived_view_exports()) { + const auto& derived_cfg = cfg.lifecycle().derived_view_exports(); + if (derived_cfg.has_ttl()) { + daemon_opts.derived_view_exports.ttl = duration_to_millis(derived_cfg.ttl()); + } + if (derived_cfg.has_retry_retire_ttl()) { + daemon_opts.derived_view_exports.retry_retire_ttl = duration_to_millis(derived_cfg.retry_retire_ttl()); + } + } daemon_opts.cpu_shared_memory_enabled = opts.cpu_shared_memory_enabled; daemon_opts.external_target_verification_enabled = cfg.engine().enable_external_target_verification(); daemon_opts.max_concurrency = std::max(1, opts.promotion.max_concurrency); @@ -1109,10 +1120,16 @@ int main(int argc, char** argv) { 
grpc_opts.keepalive_timeout_ms = to_ms(cfg.server().grpc().keepalive_timeout()); } if (cfg.server().grpc().has_max_connection_idle()) { - grpc_opts.max_connection_idle_ms = to_ms(cfg.server().grpc().max_connection_idle()); + const int max_connection_idle_ms = to_ms(cfg.server().grpc().max_connection_idle()); + if (max_connection_idle_ms > 0) { + grpc_opts.max_connection_idle_ms = max_connection_idle_ms; + } } if (cfg.server().grpc().has_max_connection_age()) { - grpc_opts.max_connection_age_ms = to_ms(cfg.server().grpc().max_connection_age()); + const int max_connection_age_ms = to_ms(cfg.server().grpc().max_connection_age()); + if (max_connection_age_ms > 0) { + grpc_opts.max_connection_age_ms = max_connection_age_ms; + } } grpc_opts.tcp_nodelay = cfg.server().grpc().tcp_nodelay(); grpc_opts.so_reuseport = cfg.server().grpc().so_reuseport(); diff --git a/daemon/ha/worker_lifecycle_manager.cc b/daemon/ha/worker_lifecycle_manager.cc index 2fdffa729..0c15d15fb 100644 --- a/daemon/ha/worker_lifecycle_manager.cc +++ b/daemon/ha/worker_lifecycle_manager.cc @@ -1311,28 +1311,41 @@ void WorkerLifecycleManager::perform_state_sync(uint64_t epoch) { case global_store::StateChange::CHANGE_TYPE_ADD_REPLICA: { // Proactively materialize the replica locally on the indicated memory. const auto& ri = ch.replica_info(); - store::DeviceKey dev{.type = DeviceType::CPU, .ordinal = -1, .uuid = ""}; - if (ri.memory_info().memory_type() == commonpb::MEMORY_TYPE_GPU) { - dev = store::DeviceRegistry::instance().gpu_key(static_cast(ri.memory_info().device_id())); - } else if (ri.memory_info().memory_type() == commonpb::MEMORY_TYPE_RAM) { - dev = store::DeviceKey{.type = DeviceType::CPU, .ordinal = -1, .uuid = ""}; - } else { + auto selector_opt = replica_selector_from_memory_info(ri.ref().artifact_id(), ri.memory_info()); + if (!selector_opt.has_value()) { // Ignore DISK-only add in daemon prefetch. 
break; } - std::string artifact_id = ri.ref().artifact_id(); + const ReplicaSelector& selector = *selector_opt; + store::DeviceKey dev{.type = DeviceType::CPU, .ordinal = -1, .uuid = ""}; + if (selector.memory_type == commonpb::MEMORY_TYPE_GPU) { + dev = store::DeviceRegistry::instance().gpu_key(selector.device_id); + } + std::string artifact_id = selector.artifact_id; + std::optional view_id = + selector.view_id.empty() ? std::nullopt : std::optional(selector.view_id); auto engine = engine_.get(); auto& async_runtime = ports_.async_runtime; if (async_runtime.is_shutting_down()) { break; } auto executor = async_runtime.blocking_executor(); - executor->add([engine = std::move(engine), dev, artifact_id = std::move(artifact_id)]() mutable { + executor->add([engine = std::move(engine), + dev, + artifact_id = std::move(artifact_id), + view_id = std::move(view_id)]() mutable { store::loading::MaterializeHints hints; hints.artifact_id = artifact_id; + if (view_id.has_value()) { + store::loading::VariantIdentity variant; + variant.canonical_artifact_id = artifact_id; + variant.view_id = *view_id; + hints.variant = std::move(variant); + } auto res = engine->materialize_replica(dev, store::StoreEngine::MaterializeMode::AUTO, hints); if (!res.ok()) { VLOG(1) << "Prefetch materialize_replica failed: artifact_id=" << artifact_id + << (view_id.has_value() ? 
absl::StrCat(" view_id=", *view_id) : std::string()) << " dev=" << dev.to_string() << ": " << res.status(); } }); diff --git a/daemon/service/controllers/materialization_controller.cc b/daemon/service/controllers/materialization_controller.cc index 3066d2f2b..a7f8b6eed 100644 --- a/daemon/service/controllers/materialization_controller.cc +++ b/daemon/service/controllers/materialization_controller.cc @@ -33,8 +33,10 @@ MaterializationController::MaterializationController(Dep d) .devices = d.devices, .disk_imports = d.disk_imports, .shutdown_signal = d.shutdown_signal, + .async_runtime = &d.async_runtime, .global_store_client = d.global_store_client, .lifecycle = d.lifecycle, + .derived_view_exports = d.derived_view_exports, .handle_leases = d.handle_leases, .cpu_shared_memory_enabled = d.cpu_shared_memory_enabled, .post_seal_policy = d.post_seal_policy, diff --git a/daemon/service/controllers/materialization_controller.h b/daemon/service/controllers/materialization_controller.h index 05e4e42bd..0ec08c6ea 100644 --- a/daemon/service/controllers/materialization_controller.h +++ b/daemon/service/controllers/materialization_controller.h @@ -19,6 +19,7 @@ #include "daemon/service/rpc_context.h" #include "daemon/state/artifact_source_registry.h" #include "daemon/state/daemon_options.h" +#include "daemon/state/derived_view_export_manager.h" #include "daemon/state/device_resolver.h" #include "daemon/state/handle_lease_registry.h" #include "daemon/state/ipc_region_registry.h" @@ -50,6 +51,7 @@ class MaterializationController { std::shared_ptr global_store_client; uint32_t max_concurrency{4}; SessionLifecycleManager* lifecycle{nullptr}; + DerivedViewExportManager* derived_view_exports{nullptr}; HandleLeaseRegistry* handle_leases{nullptr}; common::CapabilityTokenManager* capability_tokens{nullptr}; bool cpu_shared_memory_enabled{true}; diff --git a/daemon/service/controllers/materialization_request_common_utils.cc 
b/daemon/service/controllers/materialization_request_common_utils.cc index 5e1564834..3f0c51e8a 100644 --- a/daemon/service/controllers/materialization_request_common_utils.cc +++ b/daemon/service/controllers/materialization_request_common_utils.cc @@ -238,6 +238,9 @@ absl::StatusOr validate_and_compute_lease_context( if (!cpu_target) { return lease_context; } + if (lease_context.no_lease) { + return lease_context; + } if (!lease_context.loopback_peer) { return absl::PermissionDeniedError("CPU shared-memory materialization is local-only"); } diff --git a/daemon/service/controllers/replica_materialization_service.cc b/daemon/service/controllers/replica_materialization_service.cc index 76b3647f8..5d670ef89 100644 --- a/daemon/service/controllers/replica_materialization_service.cc +++ b/daemon/service/controllers/replica_materialization_service.cc @@ -13,9 +13,11 @@ #include #include +#include "absl/cleanup/cleanup.h" #include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/escaping.h" +#include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/types/span.h" @@ -60,6 +62,7 @@ using materialization_policy::to_hint_preference; using materialization_policy::validate_source_policy; using materialization_post_seal::check_post_seal_view_reuse_safe; using materialization_replica_handle::bind_replica_handle_for_response; +using materialization_replica_handle::register_session_and_refs; using materialization_request_common::LeaseContext; using materialization_request_common::LipFastPathRequest; using materialization_request_common::materialize_with_shared_disk_retry; @@ -73,6 +76,19 @@ using store::loader::ViewSpec; using store::loading::MaterializationSource; +constexpr std::string_view kDerivedViewExportReplicaPrefix = "derived-view-export:"; + +uint64_t compute_derived_view_export_reservation_bytes(uint64_t view_size_bytes) { + // Source-side derived view export transiently overlaps: 
+ // 1. newly materialized view bytes, + // 2. still-live older derived views that are only retired at reservation boundaries, + // 3. chunk/export staging that exists before the final resident bytes settle. + // + // Reserving at 1.5x keeps first-time exports admissible while forcing later + // generations to evict more stale views before materialization starts. + return view_size_bytes + (view_size_bytes / 2); +} + void record_lease_create_failed() { try { static auto meter = opentelemetry::metrics::Provider::GetMeterProvider()->GetMeter("tensorcast.daemon", "1.0.0"); @@ -192,29 +208,97 @@ absl::StatusOr retry_materialize_from_shared_disk prepare_retry_disk_source); } -std::chrono::milliseconds resolve_materialization_request_budget( +struct MaterializationBudgetInfo { + std::chrono::milliseconds request_budget{0}; + std::optional grpc_deadline_remaining; + std::chrono::milliseconds pinned_timeout{0}; +}; + +MaterializationBudgetInfo resolve_materialization_budget_info( const grpc::ServerContext& server_context, const v2::MaterializeReplicaRequest& req) { using clock = std::chrono::system_clock; constexpr std::chrono::milliseconds kDefaultBudget{600000}; - constexpr std::chrono::milliseconds kHardCap{1800000}; - const std::chrono::milliseconds requested_budget = req.pinned_allocation_timeout_ms() > 0 + MaterializationBudgetInfo out; + out.pinned_timeout = req.pinned_allocation_timeout_ms() > 0 ? 
std::chrono::milliseconds(req.pinned_allocation_timeout_ms()) - : kDefaultBudget; - std::chrono::milliseconds effective_budget = std::min(requested_budget, kHardCap); + : std::chrono::milliseconds(0); const auto grpc_deadline = server_context.deadline(); if (grpc_deadline != clock::time_point::max()) { const auto now = clock::now(); if (grpc_deadline <= now) { - return std::chrono::milliseconds(0); + out.grpc_deadline_remaining = std::chrono::milliseconds(0); + out.request_budget = std::chrono::milliseconds(0); + return out; } const auto remaining = std::chrono::duration_cast(grpc_deadline - now); if (remaining.count() <= 0) { - return std::chrono::milliseconds(1); + out.grpc_deadline_remaining = std::chrono::milliseconds(1); + out.request_budget = std::chrono::milliseconds(1); + return out; + } + out.grpc_deadline_remaining = remaining; + out.request_budget = remaining; + return out; + } + out.request_budget = kDefaultBudget; + return out; +} + +absl::Status retry_post_seal_view_reuse_if_needed( + store::StoreEngine& engine, + const std::shared_ptr& global_store_client, + const DaemonOptions::PostSealPolicy& post_seal_policy, + const store::DeviceKey& dev, + store::StoreEngine::MaterializeMode mode, + bool view_requested, + const std::optional& fallback_artifact_id, + std::string_view resolved_artifact_id, + const std::optional& disk_source_artifact_id, + const std::optional& current_disk_source, + store::loading::MaterializeHints* hints_for_retry, + absl::StatusOr* result_for_retry) { + if (result_for_retry->ok() || !view_requested || !fallback_artifact_id.has_value() || + !absl::IsNotFound(result_for_retry->status())) { + return absl::OkStatus(); + } + + bool allow_reuse = false; + if (post_seal_policy.reuse_views_if_safe) { + if (!global_store_client || !global_store_client->is_connected()) { + return absl::FailedPreconditionError("GlobalStoreClient not connected"); } - effective_budget = std::min(effective_budget, remaining); + auto safe_or = 
check_post_seal_view_reuse_safe(*global_store_client, *fallback_artifact_id, resolved_artifact_id); + if (!safe_or.ok()) { + LOG(WARNING) << "post-seal view reuse check failed for assembly=" << *fallback_artifact_id + << " mi2=" << resolved_artifact_id << ": " << safe_or.status(); + return safe_or.status(); + } + allow_reuse = *safe_or; + if (!allow_reuse) { + LOG(WARNING) << "post-seal view reuse disabled: proof commitments mismatch for assembly=" << *fallback_artifact_id + << " mi2=" << resolved_artifact_id; + } + } + + if (!allow_reuse) { + return absl::OkStatus(); + } + + hints_for_retry->artifact_id = *fallback_artifact_id; + if (hints_for_retry->variant.has_value()) { + hints_for_retry->variant->canonical_artifact_id = *fallback_artifact_id; + } + std::optional fallback_disk_source; + if (disk_source_artifact_id.has_value() && *disk_source_artifact_id == *fallback_artifact_id) { + fallback_disk_source = current_disk_source; } - return effective_budget.count() > 0 ? effective_budget : std::chrono::milliseconds(1); + auto fallback_or = engine.materialize_replica(dev, mode, *hints_for_retry, fallback_disk_source); + if (!fallback_or.ok()) { + return fallback_or.status(); + } + *result_for_retry = std::move(fallback_or); + return absl::OkStatus(); } } // namespace @@ -648,7 +732,8 @@ grpc::Status ReplicaMaterializationService::materialize_replica( if (req.pinned_allocation_timeout_ms() > 0) { hints.pinned_timeout = std::chrono::milliseconds(req.pinned_allocation_timeout_ms()); } - const std::chrono::milliseconds request_budget = resolve_materialization_request_budget(rctx.server_context(), req); + const MaterializationBudgetInfo budget_info = resolve_materialization_budget_info(rctx.server_context(), req); + const std::chrono::milliseconds request_budget = budget_info.request_budget; hints.request_budget = request_budget; hints.transport_wait_timeout = request_budget; hints.verify = verify_checksums ? 
store::loading::MaterializeHints::Verify::CHECKSUM @@ -661,6 +746,18 @@ grpc::Status ReplicaMaterializationService::materialize_replica( } hints.export_policy = to_hint_export_policy(req.export_policy()); hints.need_view_data_hash = req.has_need_view_data_hash() ? req.need_view_data_hash() : true; + LOG(INFO) << "event=materialize_budget" + << " artifact_id=" << resolved_artifact_id + << " view_id=" << (resolved_view_id.has_value() ? *resolved_view_id : std::string()) + << " pinned_timeout_ms=" << budget_info.pinned_timeout.count() << " grpc_deadline_remaining_ms=" + << (budget_info.grpc_deadline_remaining.has_value() + ? std::to_string(budget_info.grpc_deadline_remaining->count()) + : std::string("unbounded")) + << " request_budget_ms=" << request_budget.count() + << " transport_wait_timeout_ms=" << hints.transport_wait_timeout.count() + << " source_preference=" << static_cast(hints.source_preference) + << " allow_p2p=" << (hints.allow_p2p ? "true" : "false") + << " allow_disk=" << (hints.allow_disk ? "true" : "false"); if (has_artifact) hints.artifact_id = resolved_artifact_id; if (disk_metadata.has_value()) { @@ -711,46 +808,176 @@ grpc::Status ReplicaMaterializationService::materialize_replica( const auto mode = (has_disk && !has_artifact && !prefer_disk) ? 
store::StoreEngine::MaterializeMode::LOAD_ONLY : store::StoreEngine::MaterializeMode::AUTO; - auto result = d_.engine.materialize_replica(dev, mode, hints, disk_source); - if (!result.ok() && view_requested && fallback_artifact_id.has_value() && absl::IsNotFound(result.status())) { - bool allow_reuse = false; - if (d_.post_seal_policy.reuse_views_if_safe) { - if (!d_.global_store_client || !d_.global_store_client->is_connected()) { - resp.set_status(MaterializeReplicaStatus::MATERIALIZE_REPLICA_STATUS_FAILED); - return {StatusCode::FAILED_PRECONDITION, "GlobalStoreClient not connected"}; - } - auto safe_or = - check_post_seal_view_reuse_safe(*d_.global_store_client, *fallback_artifact_id, resolved_artifact_id); - if (!safe_or.ok()) { - LOG(WARNING) << "post-seal view reuse check failed for assembly=" << *fallback_artifact_id - << " mi2=" << resolved_artifact_id << ": " << safe_or.status(); - resp.set_status(MaterializeReplicaStatus::MATERIALIZE_REPLICA_STATUS_FAILED); - return to_grpc_status(safe_or.status()); - } - allow_reuse = *safe_or; - if (!allow_reuse) { - LOG(WARNING) << "post-seal view reuse disabled: proof commitments mismatch for assembly=" - << *fallback_artifact_id << " mi2=" << resolved_artifact_id; + const bool async_prepare_fast_path = cpu_target && no_lease && !req.wait_for_completion() && + !req.replica_uuid().empty() && hints.variant.has_value() && d_.async_runtime != nullptr && !has_disk; + const bool manage_derived_view_export = async_prepare_fast_path && d_.derived_view_exports != nullptr && + absl::StartsWith(req.replica_uuid(), kDerivedViewExportReplicaPrefix); + std::optional derived_view_export_reserved_bytes; + if (manage_derived_view_export) { + if (view_plan.has_value() && !view_plan->is_identity && view_plan->view_size_bytes > 0) { + derived_view_export_reserved_bytes = compute_derived_view_export_reservation_bytes(view_plan->view_size_bytes); + } else if (resolved_view_id.has_value()) { + auto view_metadata_or = 
d_.engine.get_view_metadata(resolved_artifact_id, *resolved_view_id); + if (view_metadata_or.ok() && view_metadata_or->view_size_bytes > 0) { + derived_view_export_reserved_bytes = + compute_derived_view_export_reservation_bytes(view_metadata_or->view_size_bytes); + } else if (!view_metadata_or.ok()) { + LOG(WARNING) << "Derived view export reservation size lookup failed: artifact_id=" << resolved_artifact_id + << " view_id=" << *resolved_view_id << " status=" << view_metadata_or.status(); } } + } + if (async_prepare_fast_path) { + auto ready_signal = std::make_shared>(); + store::loading::ReplicaKey pending_key{ + .artifact_id = resolved_artifact_id, + .view_id = hints.variant->view_id, + .device = dev, + .replica = 0, + }; + auto session_status = register_session_and_refs( + d_.sessions, + d_.refs, + pending_key, + ready_signal, + req.replica_uuid(), + /*pid=*/0, + /*allow_pid_ref=*/false); + if (!session_status.ok()) { + resp.set_status(MaterializeReplicaStatus::MATERIALIZE_REPLICA_STATUS_FAILED); + return to_grpc_status(session_status); + } - if (allow_reuse) { - hints.artifact_id = *fallback_artifact_id; - if (hints.variant.has_value()) { - hints.variant->canonical_artifact_id = *fallback_artifact_id; + auto executor = d_.async_runtime->blocking_executor(); + auto hints_for_async = hints; + const auto fallback_artifact_id_for_async = fallback_artifact_id; + const auto disk_source_for_async = disk_source; + const auto replica_uuid = req.replica_uuid(); + executor->add([this, + ready_signal, + replica_uuid, + manage_derived_view_export, + derived_view_export_reserved_bytes, + dev, + mode, + hints_for_async = std::move(hints_for_async), + disk_source_for_async, + fallback_artifact_id_for_async, + disk_source_artifact_id_for_async = disk_source_artifact_id, + resolved_artifact_id, + pending_key]() mutable { + bool reserved_derived_view_export = false; + bool acquired_prepare_budget = false; + if (manage_derived_view_export && 
derived_view_export_reserved_bytes.has_value() && + *derived_view_export_reserved_bytes > 0) { + const absl::Status reserve_status = + d_.derived_view_exports->reserve(pending_key, *derived_view_export_reserved_bytes); + if (!reserve_status.ok()) { + ready_signal->set_value(reserve_status); + LOG(WARNING) << "Async NO_LEASE derived view reservation failed: replica_uuid=" << replica_uuid + << " artifact_id=" << resolved_artifact_id << " status=" << reserve_status; + return; + } + reserved_derived_view_export = true; + const absl::Status prepare_budget_status = + d_.derived_view_exports->acquire_prepare_budget(*derived_view_export_reserved_bytes); + if (!prepare_budget_status.ok()) { + d_.derived_view_exports->cancel_reserved(pending_key).IgnoreError(); + ready_signal->set_value(prepare_budget_status); + LOG(WARNING) << "Async NO_LEASE derived view prepare-budget wait failed: replica_uuid=" << replica_uuid + << " artifact_id=" << resolved_artifact_id << " status=" << prepare_budget_status; + return; + } + acquired_prepare_budget = true; } - std::optional fallback_disk_source; - if (disk_source_artifact_id.has_value() && *disk_source_artifact_id == *fallback_artifact_id) { - fallback_disk_source = disk_source; + auto prepare_budget_guard = absl::MakeCleanup([&]() { + if (acquired_prepare_budget) { + d_.derived_view_exports->release_prepare_budget(*derived_view_export_reserved_bytes); + } + }); + absl::StatusOr async_result = + d_.engine.materialize_replica(dev, mode, hints_for_async, disk_source_for_async); + const absl::Status retry_status = retry_post_seal_view_reuse_if_needed( + d_.engine, + d_.global_store_client, + d_.post_seal_policy, + dev, + mode, + /*view_requested=*/true, + fallback_artifact_id_for_async, + resolved_artifact_id, + disk_source_artifact_id_for_async, + disk_source_for_async, + &hints_for_async, + &async_result); + if (!retry_status.ok()) { + if (reserved_derived_view_export) { + 
d_.derived_view_exports->cancel_reserved(pending_key).IgnoreError(); + } + ready_signal->set_value(retry_status); + LOG(WARNING) << "Async NO_LEASE materialize failed during post-seal reuse retry: replica_uuid=" << replica_uuid + << " artifact_id=" << resolved_artifact_id << " status=" << retry_status; + return; + } + const absl::Status final_status = async_result.ok() ? absl::OkStatus() : async_result.status(); + if (final_status.ok() && manage_derived_view_export) { + const absl::Status retain_status = reserved_derived_view_export + ? d_.derived_view_exports->commit_reserved(async_result->replica_key) + : d_.derived_view_exports->retain_or_refresh(async_result->replica_key); + if (!retain_status.ok()) { + if (reserved_derived_view_export) { + d_.derived_view_exports->cancel_reserved(async_result->replica_key).IgnoreError(); + } + ready_signal->set_value(retain_status); + LOG(WARNING) << "Async NO_LEASE derived view retention failed: replica_uuid=" << replica_uuid + << " artifact_id=" << resolved_artifact_id << " status=" << retain_status; + return; + } + } else if (!final_status.ok() && reserved_derived_view_export) { + d_.derived_view_exports->cancel_reserved(pending_key).IgnoreError(); } - auto fallback_or = d_.engine.materialize_replica(dev, mode, hints, fallback_disk_source); - if (fallback_or.ok()) { - result = std::move(fallback_or); + ready_signal->set_value(final_status); + if (!final_status.ok()) { + LOG(WARNING) << "Async NO_LEASE materialize failed: replica_uuid=" << replica_uuid + << " artifact_id=" << resolved_artifact_id << " status=" << final_status; } else { - resp.set_status(MaterializeReplicaStatus::MATERIALIZE_REPLICA_STATUS_FAILED); - return to_grpc_status(fallback_or.status()); + VLOG(1) << "Async NO_LEASE materialize completed: replica_uuid=" << replica_uuid + << " artifact_id=" << resolved_artifact_id; } + }); + + resp.clear_mem_handle(); + resp.set_status(MaterializeReplicaStatus::MATERIALIZE_REPLICA_STATUS_ALLOCATED); + 
resp.set_source(v2::MaterializationSource::MATERIALIZATION_SOURCE_UNSPECIFIED); + span->SetAttribute("tc.store.source", static_cast(resp.source())); + if (view_plan.has_value() && !view_plan->is_identity) { + resp.set_view_index_json(view_plan->view_index_json); } + VLOG(1) << "MaterializationController: fast-return async NO_LEASE materialize accepted for replica_uuid=" + << req.replica_uuid() << " artifact_id=" << resolved_artifact_id << " view_id=" + << (resolved_view_id.has_value() + ? *resolved_view_id + : (hints.variant && hints.variant->view_id ? *hints.variant->view_id : std::string(""))); + return finalize_response(); + } + + auto result = d_.engine.materialize_replica(dev, mode, hints, disk_source); + const absl::Status post_seal_retry_status = retry_post_seal_view_reuse_if_needed( + d_.engine, + d_.global_store_client, + d_.post_seal_policy, + dev, + mode, + view_requested, + fallback_artifact_id, + resolved_artifact_id, + disk_source_artifact_id, + disk_source, + &hints, + &result); + if (!post_seal_retry_status.ok()) { + resp.set_status(MaterializeReplicaStatus::MATERIALIZE_REPLICA_STATUS_FAILED); + return to_grpc_status(post_seal_retry_status); } if (!result.ok()) { auto retry_or = retry_materialize_from_shared_disk( @@ -851,6 +1078,36 @@ grpc::Status ReplicaMaterializationService::materialize_replica( << handle.replica_key << " cpu_state=" << static_cast(handle.cpu_state) << " gpu_state=" << static_cast(handle.gpu_state); } + if (no_lease) { + auto session_status = register_session_and_refs( + d_.sessions, + d_.refs, + handle.replica_key, + handle.ready_signal, + req.replica_uuid(), + /*pid=*/0, + /*allow_pid_ref=*/false); + if (!session_status.ok()) { + resp.set_status(MaterializeReplicaStatus::MATERIALIZE_REPLICA_STATUS_FAILED); + return to_grpc_status(session_status); + } + resp.clear_mem_handle(); + resp.set_status(MaterializeReplicaStatus::MATERIALIZE_REPLICA_STATUS_ALLOCATED); + if (handle.view_index_json.has_value()) { + 
resp.set_view_index_json(*handle.view_index_json); + } + if (resp.view_index_json().empty() && normalized_disk_path.has_value()) { + if (disk_index.has_value()) { + resp.set_view_index_json(disk_index->canonical_index_json); + } else { + auto local_index_or = store::loader::read_from_artifact_dir(*normalized_disk_path, dev.ordinal); + if (local_index_or.ok()) { + resp.set_view_index_json(local_index_or->canonical_index_json); + } + } + } + return finalize_response(); + } auto bind_status = bind_materialized_handle( d_.engine, d_.sessions, diff --git a/daemon/service/controllers/replica_materialization_service.h b/daemon/service/controllers/replica_materialization_service.h index e547f257a..2518e5e5c 100644 --- a/daemon/service/controllers/replica_materialization_service.h +++ b/daemon/service/controllers/replica_materialization_service.h @@ -5,11 +5,13 @@ #include #include +#include "core/common/async_runtime.h" #include "core/store/components/global_store_client.h" #include "core/store/store_engine.h" #include "daemon/service/rpc_context.h" #include "daemon/state/artifact_source_registry.h" #include "daemon/state/daemon_options.h" +#include "daemon/state/derived_view_export_manager.h" #include "daemon/state/device_resolver.h" #include "daemon/state/handle_lease_registry.h" #include "daemon/state/lip_bridge.h" @@ -31,8 +33,10 @@ class ReplicaMaterializationService { DeviceResolver& devices; ArtifactSourceRegistry& disk_imports; ShutdownSignal& shutdown_signal; + common::AsyncRuntime* async_runtime{nullptr}; std::shared_ptr global_store_client; SessionLifecycleManager* lifecycle{nullptr}; + DerivedViewExportManager* derived_view_exports{nullptr}; HandleLeaseRegistry* handle_leases{nullptr}; bool cpu_shared_memory_enabled{true}; DaemonOptions::PostSealPolicy post_seal_policy{}; diff --git a/daemon/service/controllers/transport_controller.cc b/daemon/service/controllers/transport_controller.cc index 926c2afa7..9994c5ae3 100644 --- 
a/daemon/service/controllers/transport_controller.cc +++ b/daemon/service/controllers/transport_controller.cc @@ -93,6 +93,13 @@ grpc::Status TransportController::lock( std::string token = d_.locks.mint_token(); d_.locks.put(token, key, std::move(indices)); resp.set_lock_token(token); + if (requested_view_id.has_value() && d_.derived_view_exports != nullptr) { + absl::Status begin_fetch_status = d_.derived_view_exports->begin_fetch(key, token); + if (!absl::IsNotFound(begin_fetch_status) && !begin_fetch_status.ok()) { + d_.locks.erase(token); + return to_grpc_status(begin_fetch_status); + } + } rctx.mark_success(); return Status::OK; } @@ -117,7 +124,82 @@ grpc::Status TransportController::unlock( return Status::OK; } // UMA V3: No engine-level unlock; treat daemon unlock as idempotent bookkeeping: just erase the token. - d_.locks.erase(req.lock_token()); + auto removed = d_.locks.take(req.lock_token()); + if (removed.has_value() && removed->key.view_id.has_value() && d_.derived_view_exports != nullptr) { + d_.derived_view_exports->end_fetch(req.lock_token(), "unlock"); + } + rctx.mark_success(); + return Status::OK; +} + +grpc::Status TransportController::begin_replica_fetch( + RpcContext& rctx, + const v2::BeginReplicaFetchRequest& req, + v2::BeginReplicaFetchResponse& resp) { + auto& span = rctx.span(); + span->SetAttribute("tc.artifact.id", req.artifact_id()); + span->SetAttribute("tc.transport.id", req.transport_id()); + if (req.view_id().empty()) { + return {grpc::StatusCode::INVALID_ARGUMENT, "view_id is required"}; + } + if (req.transport_id().empty()) { + return {grpc::StatusCode::INVALID_ARGUMENT, "transport_id is required"}; + } + if (d_.derived_view_exports == nullptr) { + resp.set_managed(false); + rctx.mark_success(); + return Status::OK; + } + + store::loading::ReplicaKey key; + key.artifact_id = req.artifact_id(); + key.view_id = req.view_id(); + key.replica = 0; + switch (req.device_type()) { + case v2::DEVICE_TYPE_CPU: + key.device = 
store::DeviceKey{.type = DeviceType::CPU, .ordinal = -1}; + break; + case v2::DEVICE_TYPE_GPU: + if (!req.has_device_id()) { + return {grpc::StatusCode::INVALID_ARGUMENT, "GPU fetch requires device_id"}; + } + key.device = store::DeviceRegistry::instance().gpu_key(req.device_id()); + break; + default: + return {grpc::StatusCode::INVALID_ARGUMENT, "unsupported device_type"}; + } + + const absl::Status begin_status = d_.derived_view_exports->begin_fetch(key, req.transport_id()); + if (absl::IsNotFound(begin_status)) { + resp.set_managed(false); + rctx.mark_success(); + return Status::OK; + } + if (!begin_status.ok()) { + return to_grpc_status(begin_status); + } + resp.set_managed(true); + rctx.mark_success(); + return Status::OK; +} + +grpc::Status TransportController::end_replica_fetch( + RpcContext& rctx, + const v2::EndReplicaFetchRequest& req, + v2::EndReplicaFetchResponse& resp) { + auto& span = rctx.span(); + span->SetAttribute("tc.transport.id", req.transport_id()); + if (req.transport_id().empty()) { + return {grpc::StatusCode::INVALID_ARGUMENT, "transport_id is required"}; + } + if (d_.derived_view_exports == nullptr) { + resp.set_managed(false); + rctx.mark_success(); + return Status::OK; + } + const auto reason = req.has_reason() ? 
std::string_view(req.reason()) : std::string_view("rpc_end_fetch"); + d_.derived_view_exports->end_fetch(req.transport_id(), reason); + resp.set_managed(true); rctx.mark_success(); return Status::OK; } diff --git a/daemon/service/controllers/transport_controller.h b/daemon/service/controllers/transport_controller.h index d3f729425..d29497dc3 100644 --- a/daemon/service/controllers/transport_controller.h +++ b/daemon/service/controllers/transport_controller.h @@ -6,6 +6,7 @@ #include "core/store/store_engine.h" #include "daemon/service/rpc_context.h" +#include "daemon/state/derived_view_export_manager.h" #include "daemon/state/lip_manager.h" #include "daemon/state/transport_lock_manager.h" #include "tensorcast/daemon/v2/store_daemon.grpc.pb.h" @@ -18,6 +19,7 @@ class TransportController { store::StoreEngine& engine; TransportLockManager& locks; LipManager& lip; + DerivedViewExportManager* derived_view_exports{nullptr}; }; explicit TransportController(Dep d) : d_(d) {} @@ -29,6 +31,16 @@ class TransportController { const v2::UnlockTransportChunksRequest& req, v2::UnlockTransportChunksResponse& resp); + grpc::Status begin_replica_fetch( + RpcContext& rctx, + const v2::BeginReplicaFetchRequest& req, + v2::BeginReplicaFetchResponse& resp); + + grpc::Status end_replica_fetch( + RpcContext& rctx, + const v2::EndReplicaFetchRequest& req, + v2::EndReplicaFetchResponse& resp); + private: Dep d_; }; diff --git a/daemon/service/grpc_service_impl.h b/daemon/service/grpc_service_impl.h index dfabe6cf1..2f7f1d222 100644 --- a/daemon/service/grpc_service_impl.h +++ b/daemon/service/grpc_service_impl.h @@ -100,6 +100,16 @@ class StoreDaemonServiceImpl final : public v2::StoreDaemonService::Service { const v2::RegisterVramRegionRequest* req, v2::RegisterVramRegionResponse* resp) override; + grpc::Status BeginReplicaFetch( + grpc::ServerContext* ctx, + const v2::BeginReplicaFetchRequest* req, + v2::BeginReplicaFetchResponse* resp) override; + + grpc::Status EndReplicaFetch( + 
grpc::ServerContext* ctx, + const v2::EndReplicaFetchRequest* req, + v2::EndReplicaFetchResponse* resp) override; + grpc::Status UnregisterVramRegion( grpc::ServerContext* ctx, const v2::UnregisterVramRegionRequest* req, diff --git a/daemon/service/grpc_service_impl_rpc_delegates.cc b/daemon/service/grpc_service_impl_rpc_delegates.cc index eb3e878f3..81cba8e27 100644 --- a/daemon/service/grpc_service_impl_rpc_delegates.cc +++ b/daemon/service/grpc_service_impl_rpc_delegates.cc @@ -238,6 +238,22 @@ Status StoreDaemonServiceImpl::LockTransportChunks( return transport_controller_->lock(rctx, *req, *resp); } +Status StoreDaemonServiceImpl::BeginReplicaFetch( + grpc::ServerContext* ctx, + const v2::BeginReplicaFetchRequest* req, + v2::BeginReplicaFetchResponse* resp) { + RpcContext rctx{"BeginReplicaFetch", *ctx, opts_.allow_high_card_attrs}; + return transport_controller_->begin_replica_fetch(rctx, *req, *resp); +} + +Status StoreDaemonServiceImpl::EndReplicaFetch( + grpc::ServerContext* ctx, + const v2::EndReplicaFetchRequest* req, + v2::EndReplicaFetchResponse* resp) { + RpcContext rctx{"EndReplicaFetch", *ctx, opts_.allow_high_card_attrs}; + return transport_controller_->end_replica_fetch(rctx, *req, *resp); +} + Status StoreDaemonServiceImpl::PublishTargetReplica( grpc::ServerContext* ctx, const v2::PublishTargetReplicaRequest* req, diff --git a/daemon/state/daemon_kernel.cc b/daemon/state/daemon_kernel.cc index 26074cd97..8182f01e9 100644 --- a/daemon/state/daemon_kernel.cc +++ b/daemon/state/daemon_kernel.cc @@ -33,6 +33,14 @@ DaemonKernel::DaemonKernel( verif_tracker_->set_serial_executor(async_runtime_->serial_executor()); lifecycle_mgr_ = std::make_shared(sessions_, refs_, *lip_mgr_, *engine_); + derived_view_export_mgr_ = std::make_unique( + *engine_, + *lifecycle_mgr_, + DerivedViewExportManager::Options{ + .ttl = absl::Milliseconds(static_cast(options_.derived_view_exports.ttl.count())), + .retry_retire_ttl = + 
absl::Milliseconds(static_cast(options_.derived_view_exports.retry_retire_ttl.count())), + }); pid_monitor_ = std::make_unique( [this](pid_t pid) { if (this->lifecycle_mgr_) { @@ -199,7 +207,7 @@ void DaemonKernel::configure_scheduler_tasks_() { // Lock TTL { - auto t = std::make_shared(locks_, *engine_); + auto t = std::make_shared(locks_, *engine_, derived_view_export_mgr_.get()); scheduler_->add_task( TaskKind::kLockTTL, std::chrono::duration_cast(options_.locks_sweep_interval), [t]() { t->run_once(); diff --git a/daemon/state/daemon_kernel.h b/daemon/state/daemon_kernel.h index 41d1160c6..a4702d31b 100644 --- a/daemon/state/daemon_kernel.h +++ b/daemon/state/daemon_kernel.h @@ -15,6 +15,7 @@ #include "daemon/state/artifact_source_registry.h" #include "daemon/state/background_scheduler.h" #include "daemon/state/daemon_options.h" +#include "daemon/state/derived_view_export_manager.h" #include "daemon/state/device_resolver.h" #include "daemon/state/handle_lease_registry.h" #include "daemon/state/ipc_region_registry.h" @@ -95,6 +96,10 @@ class DaemonKernel { return *lip_bridge_; } + [[nodiscard]] DerivedViewExportManager& derived_view_export_manager() const { + return *derived_view_export_mgr_; + } + [[nodiscard]] ArtifactSourceRegistry& source_registry() { return source_registry_; } @@ -169,6 +174,7 @@ class DaemonKernel { RefTracker refs_; std::unique_ptr region_registry_; std::unique_ptr lip_mgr_; + std::unique_ptr derived_view_export_mgr_; std::unique_ptr scheduler_; std::shared_ptr lifecycle_mgr_; diff --git a/daemon/state/daemon_options.h b/daemon/state/daemon_options.h index 8e8329e9c..daaab99d0 100644 --- a/daemon/state/daemon_options.h +++ b/daemon/state/daemon_options.h @@ -72,6 +72,13 @@ struct DaemonOptions { // Best-effort guardrail: limit lease-bearing handle mints per second (0 => unlimited). 
uint32_t handle_lease_max_mints_per_second{0}; + struct DerivedViewExports { + std::chrono::milliseconds ttl{std::chrono::minutes(10)}; + std::chrono::milliseconds retry_retire_ttl{std::chrono::seconds(30)}; + }; + + DerivedViewExports derived_view_exports{}; + // CPU shared-memory materialization (memfd-backed UMA CPU arena). bool cpu_shared_memory_enabled{true}; // Enable verification for MaterializeIntoTarget external target writes. diff --git a/daemon/state/derived_view_export_manager.cc b/daemon/state/derived_view_export_manager.cc new file mode 100644 index 000000000..30b6cf5b6 --- /dev/null +++ b/daemon/state/derived_view_export_manager.cc @@ -0,0 +1,952 @@ +// Copyright (c) 2025-2026, TensorCast Team. + +#include "daemon/state/derived_view_export_manager.h" + +#include + +#include +#include +#include + +#include "absl/log/log.h" +#include "absl/strings/str_cat.h" + +namespace tensorcast::daemon { + +namespace { + +constexpr uint32_t kRetireDrainTimeoutMs = 30000; +constexpr std::string_view kResidentViewRouteKind = "resident_view"; +constexpr uint64_t kDerivedViewBudgetFloorBytes = 16ULL << 30; +constexpr uint64_t kDerivedViewBudgetCeilBytes = 128ULL << 30; +constexpr uint64_t kDerivedViewHeadroomFloorBytes = 64ULL << 30; +constexpr uint64_t kDerivedViewHeadroomCeilBytes = 128ULL << 30; + +struct DerivedBudgetWindow { + uint64_t base_budget_bytes{0}; + uint64_t headroom_bytes{0}; + uint64_t resident_derived_bytes{0}; + uint64_t non_derived_stable_bytes{0}; + uint64_t effective_budget_bytes{0}; +}; + +uint64_t clamp_derived_budget_bytes(uint64_t stable_total_bytes) { + if (stable_total_bytes == 0) { + return 0; + } + const uint64_t quarter_budget = stable_total_bytes / 4; + return std::min(kDerivedViewBudgetCeilBytes, std::max(kDerivedViewBudgetFloorBytes, quarter_budget)); +} + +DerivedBudgetWindow compute_budget_window( + const store::MemoryTierBudget::Snapshot& snapshot, + uint64_t resident_derived_bytes, + const DerivedViewExportManager::Options& 
options) { + DerivedBudgetWindow window; + window.base_budget_bytes = + options.budget_override_bytes.value_or(clamp_derived_budget_bytes(snapshot.stable_total_bytes)); + if (snapshot.stable_total_bytes == 0) { + window.headroom_bytes = options.headroom_override_bytes.value_or(0); + window.resident_derived_bytes = resident_derived_bytes; + window.effective_budget_bytes = window.base_budget_bytes; + return window; + } + + window.headroom_bytes = options.headroom_override_bytes.value_or( + std::min( + kDerivedViewHeadroomCeilBytes, std::max(kDerivedViewHeadroomFloorBytes, snapshot.stable_total_bytes / 8))); + window.resident_derived_bytes = resident_derived_bytes; + window.non_derived_stable_bytes = + snapshot.stable_used_bytes > resident_derived_bytes ? snapshot.stable_used_bytes - resident_derived_bytes : 0; + if (window.non_derived_stable_bytes + window.headroom_bytes >= snapshot.stable_total_bytes) { + window.effective_budget_bytes = 0; + return window; + } + + const uint64_t headroom_limited_budget = + snapshot.stable_total_bytes - window.non_derived_stable_bytes - window.headroom_bytes; + window.effective_budget_bytes = std::min(window.base_budget_bytes, headroom_limited_budget); + return window; +} + +} // namespace + +DerivedViewExportManager::DerivedViewExportManager(store::StoreEngine& engine, SessionLifecycleManager& lifecycle) + : DerivedViewExportManager(engine, lifecycle, Options{}) {} + +DerivedViewExportManager::DerivedViewExportManager( + store::StoreEngine& engine, + SessionLifecycleManager& lifecycle, + Options options) + : engine_(engine), lifecycle_(lifecycle), options_(std::move(options)), owner_pid_(::getpid()) { + if (options_.ttl <= absl::ZeroDuration()) { + options_.ttl = absl::Minutes(10); + } + if (options_.retry_retire_ttl <= absl::ZeroDuration()) { + options_.retry_retire_ttl = absl::Seconds(30); + } +} + +void DerivedViewExportManager::set_global_store_client(std::shared_ptr client) { + global_store_client_ = std::move(client); +} + 
+bool DerivedViewExportManager::can_acquire_prepare_budget_locked(const PrepareBudgetWaitContext* ctx) { + return ctx != nullptr && ctx->manager != nullptr && + (ctx->manager->pending_prepare_bytes_ + ctx->reserved_bytes) <= ctx->pending_budget_bytes; +} + +bool DerivedViewExportManager::can_finish_local_drain_locked(const LocalDrainWaitContext* ctx) { + if (ctx == nullptr || ctx->manager == nullptr) { + return true; + } + auto it = ctx->manager->entries_.find(ctx->key); + return it == ctx->manager->entries_.end() || it->second.generation != ctx->generation || + it->second.active_fetches == 0; +} + +absl::Status DerivedViewExportManager::acquire_prepare_budget(uint64_t reserved_bytes) { + if (reserved_bytes == 0) { + return absl::OkStatus(); + } + + for (;;) { + const auto snapshot_opt = engine_.get_memory_tier_snapshot(); + if (!snapshot_opt.has_value()) { + return absl::OkStatus(); + } + + absl::MutexLock lock(&mu_); + const DerivedBudgetWindow budget_window = + compute_budget_window(*snapshot_opt, current_ready_derived_bytes_locked(), options_); + const uint64_t pending_budget_bytes = std::max(reserved_bytes, budget_window.effective_budget_bytes); + PrepareBudgetWaitContext wait_ctx{ + .manager = this, + .reserved_bytes = reserved_bytes, + .pending_budget_bytes = pending_budget_bytes, + }; + if (can_acquire_prepare_budget_locked(&wait_ctx)) { + pending_prepare_bytes_ += reserved_bytes; + VLOG(1) << "DerivedViewExportManager: acquired prepare budget reserved_bytes=" << reserved_bytes + << " pending_prepare_bytes=" << pending_prepare_bytes_ << " pending_budget_bytes=" << pending_budget_bytes + << " effective_budget_bytes=" << budget_window.effective_budget_bytes + << " non_derived_stable_bytes=" << budget_window.non_derived_stable_bytes; + return absl::OkStatus(); + } + + VLOG(1) << "DerivedViewExportManager: waiting for prepare budget reserved_bytes=" << reserved_bytes + << " pending_prepare_bytes=" << pending_prepare_bytes_ << " pending_budget_bytes=" << 
pending_budget_bytes + << " effective_budget_bytes=" << budget_window.effective_budget_bytes + << " non_derived_stable_bytes=" << budget_window.non_derived_stable_bytes; + mu_.Await(absl::Condition(&DerivedViewExportManager::can_acquire_prepare_budget_locked, &wait_ctx)); + } +} + +void DerivedViewExportManager::release_prepare_budget(uint64_t reserved_bytes) { + if (reserved_bytes == 0) { + return; + } + + absl::MutexLock lock(&mu_); + pending_prepare_bytes_ = pending_prepare_bytes_ > reserved_bytes ? pending_prepare_bytes_ - reserved_bytes : 0; + VLOG(1) << "DerivedViewExportManager: released prepare budget reserved_bytes=" << reserved_bytes + << " pending_prepare_bytes=" << pending_prepare_bytes_; +} + +std::optional DerivedViewExportManager::to_entry_key(const store::loading::ReplicaKey& key) { + if (!key.view_id.has_value() || key.view_id->empty()) { + return std::nullopt; + } + return ArtifactDeviceKey{ + .artifact_id = key.artifact_id, + .view_id = *key.view_id, + .device_id = key.device.type == DeviceType::CPU ? 
-1 : key.device.ordinal, + }; +} + +absl::Status DerivedViewExportManager::install_entry( + const ArtifactDeviceKey& key, + const store::loading::ReplicaKey& replica_key) { + Entry entry; + entry.replica_key = replica_key; + entry.state = EntryState::kPending; + entry.generation = next_generation_++; + entry.ttl = options_.ttl; + entry.active_fetches = 0; + entry.accept_new_fetches = true; + auto resident_bytes_or = engine_.get_replica_size(replica_key); + if (!resident_bytes_or.ok()) { + LOG(WARNING) << "DerivedViewExportManager: failed to query replica size for artifact_id=" << key.artifact_id + << " view_id=" << key.view_id << " generation=" << entry.generation << ": " + << resident_bytes_or.status(); + entry.resident_bytes = 0; + } else { + entry.resident_bytes = *resident_bytes_or; + } + entry.last_access_time = absl::Now(); + entry.expiry_time = absl::InfiniteFuture(); + entries_[key] = std::move(entry); + return activate_reserved_entry(key, entries_[key]); +} + +absl::Status DerivedViewExportManager::activate_reserved_entry(const ArtifactDeviceKey& key, Entry& entry) { + auto use_lease_or = lifecycle_.create_use_lease(entry.replica_key, owner_pid_); + if (!use_lease_or.ok()) { + return use_lease_or.status(); + } + + auto retention_lease_or = lifecycle_.create_retention_lease( + options_.ttl, + {[this, key, generation = entry.generation]() { return this->on_retention_expired(key, generation); }}); + if (!retention_lease_or.ok()) { + lifecycle_.release_lease(*use_lease_or); + return retention_lease_or.status(); + } + + entry.use_lease_id = *use_lease_or; + entry.retention_lease_id = *retention_lease_or; + entry.state = EntryState::kReady; + entry.ttl = options_.ttl; + entry.active_fetches = 0; + entry.accept_new_fetches = true; + auto resident_bytes_or = engine_.get_replica_size(entry.replica_key); + if (!resident_bytes_or.ok()) { + LOG(WARNING) << "DerivedViewExportManager: failed to query replica size for artifact_id=" << key.artifact_id + << " view_id=" << 
key.view_id << " generation=" << entry.generation << ": " + << resident_bytes_or.status(); + } else { + entry.resident_bytes = *resident_bytes_or; + } + entry.last_access_time = absl::Now(); + entry.expiry_time = entry.last_access_time + entry.ttl; + VLOG(1) << "DerivedViewExportManager: installed entry artifact_id=" << key.artifact_id << " view_id=" << key.view_id + << " device_id=" << key.device_id << " generation=" << entry.generation + << " resident_bytes=" << entry.resident_bytes << " event=create" + << " route_kind=" << kResidentViewRouteKind; + return absl::OkStatus(); +} + +absl::Status DerivedViewExportManager::reserve(const store::loading::ReplicaKey& key, uint64_t reserved_bytes) { + auto entry_key = to_entry_key(key); + if (!entry_key.has_value()) { + return absl::InvalidArgumentError("derived view export reservation requires ReplicaKey.view_id"); + } + + const auto snapshot_opt = engine_.get_memory_tier_snapshot(); + if (!snapshot_opt.has_value()) { + return absl::OkStatus(); + } + const auto resident_derived_bytes_locked = [this]() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + uint64_t total_bytes = 0; + for (const auto& [entry_key_ignored, entry] : entries_) { + (void)entry_key_ignored; + if (entry.state == EntryState::kPending) { + continue; + } + total_bytes += entry.resident_bytes; + } + return total_bytes; + }; + + std::vector> victims; + uint64_t reserved_generation = 0; + DerivedBudgetWindow budget_window; + { + absl::MutexLock lock(&mu_); + auto it = entries_.find(*entry_key); + if (it != entries_.end()) { + if (it->second.state == EntryState::kReady) { + VLOG(1) << "DerivedViewExportManager: reuse hit artifact_id=" << entry_key->artifact_id + << " view_id=" << entry_key->view_id << " device_id=" << entry_key->device_id + << " generation=" << it->second.generation << " event=reuse_hit" + << " route_kind=" << kResidentViewRouteKind; + return renew_entry(*entry_key, it->second); + } + return absl::FailedPreconditionError("derived view export already 
pending or draining"); + } + + Entry entry; + entry.replica_key = key; + entry.state = EntryState::kPending; + entry.generation = next_generation_++; + entry.ttl = options_.ttl; + entry.resident_bytes = reserved_bytes; + entry.active_fetches = 0; + entry.accept_new_fetches = true; + entry.last_access_time = absl::Now(); + entry.expiry_time = absl::InfiniteFuture(); + entries_[*entry_key] = std::move(entry); + reserved_generation = entries_[*entry_key].generation; + + budget_window = compute_budget_window(*snapshot_opt, resident_derived_bytes_locked(), options_); + const uint64_t budget_bytes = budget_window.effective_budget_bytes; + if (budget_bytes == 0) { + entries_.erase(*entry_key); + return absl::ResourceExhaustedError( + absl::StrCat( + "derived-view budget exhausted by non-derived stable residency: stable_total_bytes=", + snapshot_opt->stable_total_bytes, + " stable_used_bytes=", + snapshot_opt->stable_used_bytes, + " resident_derived_bytes=", + budget_window.resident_derived_bytes, + " non_derived_stable_bytes=", + budget_window.non_derived_stable_bytes, + " headroom_bytes=", + budget_window.headroom_bytes)); + } + + uint64_t total_bytes = current_derived_bytes_locked(); + if (total_bytes <= budget_bytes) { + VLOG(1) << "DerivedViewExportManager: reserved pending entry artifact_id=" << entry_key->artifact_id + << " view_id=" << entry_key->view_id << " device_id=" << entry_key->device_id + << " generation=" << entries_[*entry_key].generation << " reserved_bytes=" << reserved_bytes + << " budget_bytes=" << budget_bytes << " base_budget_bytes=" << budget_window.base_budget_bytes + << " headroom_bytes=" << budget_window.headroom_bytes + << " non_derived_stable_bytes=" << budget_window.non_derived_stable_bytes + << " resident_derived_bytes=" << budget_window.resident_derived_bytes; + return absl::OkStatus(); + } + + std::vector> candidates; + candidates.reserve(entries_.size()); + for (const auto& [candidate_key, candidate_entry] : entries_) { + if 
(candidate_entry.state != EntryState::kReady || candidate_entry.active_fetches > 0) { + continue; + } + if (candidate_key == *entry_key) { + continue; + } + candidates.emplace_back(candidate_key, candidate_entry); + } + + std::sort(candidates.begin(), candidates.end(), [entry_key](const auto& lhs, const auto& rhs) { + const bool lhs_cross_artifact = lhs.first.artifact_id != entry_key->artifact_id; + const bool rhs_cross_artifact = rhs.first.artifact_id != entry_key->artifact_id; + if (lhs_cross_artifact != rhs_cross_artifact) { + return lhs_cross_artifact > rhs_cross_artifact; + } + return lhs.second.last_access_time < rhs.second.last_access_time; + }); + + for (const auto& candidate : candidates) { + const bool cross_artifact = candidate.first.artifact_id != entry_key->artifact_id; + if (!cross_artifact && total_bytes <= budget_bytes) { + break; + } + auto candidate_it = entries_.find(candidate.first); + if (candidate_it == entries_.end() || candidate_it->second.generation != candidate.second.generation || + candidate_it->second.state != EntryState::kReady) { + continue; + } + candidate_it->second.state = EntryState::kDraining; + total_bytes = total_bytes > candidate.second.resident_bytes ? 
total_bytes - candidate.second.resident_bytes : 0; + victims.push_back(candidate); + } + + if (total_bytes > budget_bytes) { + entries_.erase(*entry_key); + return absl::ResourceExhaustedError( + absl::StrCat( + "unable to free enough derived-view budget before export: budget_bytes=", + budget_bytes, + " base_budget_bytes=", + budget_window.base_budget_bytes, + " headroom_bytes=", + budget_window.headroom_bytes, + " non_derived_stable_bytes=", + budget_window.non_derived_stable_bytes, + " current_derived_bytes=", + total_bytes)); + } + } + + if (!victims.empty()) { + VLOG(1) << "DerivedViewExportManager: reservation eviction start victims=" << victims.size() + << " budget_bytes=" << budget_window.effective_budget_bytes + << " base_budget_bytes=" << budget_window.base_budget_bytes + << " headroom_bytes=" << budget_window.headroom_bytes + << " non_derived_stable_bytes=" << budget_window.non_derived_stable_bytes + << " reserved_bytes=" << reserved_bytes; + } + for (const auto& [victim_key, victim_entry] : victims) { + const absl::Status retire_status = + retire_entry(victim_key, victim_entry, "pre_reserve_budget", /*retry_on_failure=*/false); + if (!retire_status.ok()) { + cancel_reserved(key).IgnoreError(); + return retire_status; + } + } + + VLOG(1) << "DerivedViewExportManager: reserved pending entry artifact_id=" << entry_key->artifact_id + << " view_id=" << entry_key->view_id << " device_id=" << entry_key->device_id + << " generation=" << reserved_generation << " reserved_bytes=" << reserved_bytes + << " budget_bytes=" << budget_window.effective_budget_bytes + << " base_budget_bytes=" << budget_window.base_budget_bytes + << " headroom_bytes=" << budget_window.headroom_bytes + << " non_derived_stable_bytes=" << budget_window.non_derived_stable_bytes; + return absl::OkStatus(); +} + +absl::Status DerivedViewExportManager::commit_reserved(const store::loading::ReplicaKey& key) { + auto entry_key = to_entry_key(key); + if (!entry_key.has_value()) { + return 
absl::InvalidArgumentError("derived view export commit requires ReplicaKey.view_id"); + } + + { + absl::MutexLock lock(&mu_); + auto it = entries_.find(*entry_key); + if (it == entries_.end()) { + return absl::NotFoundError("derived view export reservation not found"); + } + if (it->second.state == EntryState::kReady) { + return renew_entry(*entry_key, it->second); + } + if (it->second.state != EntryState::kPending) { + return absl::FailedPreconditionError("derived view export reservation is not pending"); + } + return activate_reserved_entry(*entry_key, it->second); + } +} + +absl::Status DerivedViewExportManager::cancel_reserved(const store::loading::ReplicaKey& key) { + auto entry_key = to_entry_key(key); + if (!entry_key.has_value()) { + return absl::InvalidArgumentError("derived view export cancel requires ReplicaKey.view_id"); + } + + absl::MutexLock lock(&mu_); + auto it = entries_.find(*entry_key); + if (it == entries_.end()) { + return absl::NotFoundError("derived view export reservation not found"); + } + if (it->second.state != EntryState::kPending) { + return absl::FailedPreconditionError("derived view export reservation is not pending"); + } + entries_.erase(it); + return absl::OkStatus(); +} + +absl::Status DerivedViewExportManager::renew_entry(const ArtifactDeviceKey& key, Entry& entry) { + if (entry.state != EntryState::kReady) { + return absl::FailedPreconditionError("derived view export is draining"); + } + absl::Status renew_status = lifecycle_.renew_retention(entry.retention_lease_id, entry.ttl); + if (renew_status.ok()) { + entry.last_access_time = absl::Now(); + entry.expiry_time = entry.last_access_time + entry.ttl; + VLOG(2) << "DerivedViewExportManager: renewed entry artifact_id=" << key.artifact_id << " view_id=" << key.view_id + << " device_id=" << key.device_id << " generation=" << entry.generation << " event=ttl_refresh" + << " route_kind=" << kResidentViewRouteKind; + } + return renew_status; +} + +absl::Status 
DerivedViewExportManager::retain_or_refresh(const store::loading::ReplicaKey& key) { + auto entry_key = to_entry_key(key); + if (!entry_key.has_value()) { + return absl::InvalidArgumentError("derived view export retention requires ReplicaKey.view_id"); + } + + { + absl::MutexLock lock(&mu_); + auto it = entries_.find(*entry_key); + if (it != entries_.end()) { + VLOG(1) << "DerivedViewExportManager: retain_or_refresh hit artifact_id=" << entry_key->artifact_id + << " view_id=" << entry_key->view_id << " device_id=" << entry_key->device_id + << " generation=" << it->second.generation << " event=reuse_hit" + << " route_kind=" << kResidentViewRouteKind; + return renew_entry(*entry_key, it->second); + } + absl::Status install_status = install_entry(*entry_key, key); + if (!install_status.ok()) { + return install_status; + } + } + return maybe_evict_for_budget(*entry_key); +} + +absl::Status DerivedViewExportManager::begin_fetch(const store::loading::ReplicaKey& key, std::string_view fetch_id) { + auto entry_key = to_entry_key(key); + if (!entry_key.has_value()) { + return absl::InvalidArgumentError("derived view export begin_fetch requires ReplicaKey.view_id"); + } + if (fetch_id.empty()) { + return absl::InvalidArgumentError("derived view export begin_fetch requires fetch_id"); + } + + absl::MutexLock lock(&mu_); + auto it = entries_.find(*entry_key); + if (it == entries_.end()) { + return absl::NotFoundError("derived view export not managed"); + } + auto fetch_it = active_fetches_.find(std::string(fetch_id)); + if (fetch_it != active_fetches_.end()) { + if (fetch_it->second.key == *entry_key && fetch_it->second.generation == it->second.generation) { + VLOG(2) << "DerivedViewExportManager: begin fetch idempotent artifact_id=" << entry_key->artifact_id + << " view_id=" << entry_key->view_id << " generation=" << it->second.generation + << " fetch_id=" << fetch_id; + return absl::OkStatus(); + } + return absl::AlreadyExistsError( + absl::StrCat("derived view export 
fetch_id already bound to another entry: fetch_id=", fetch_id)); + } + if (it->second.state == EntryState::kPending) { + return absl::FailedPreconditionError( + "derived view export fetch rejected: entry is still preparing; fallback_reason=entry_pending"); + } + if (!it->second.accept_new_fetches) { + return absl::FailedPreconditionError( + "derived view export fetch rejected: draining entry no longer accepts attaches; " + "fallback_reason=drain_attach_closed"); + } + + if (it->second.state == EntryState::kReady) { + const absl::Status renew_status = renew_entry(*entry_key, it->second); + if (!renew_status.ok()) { + LOG(WARNING) << "DerivedViewExportManager: data-plane TTL refresh failed for artifact_id=" + << entry_key->artifact_id << " view_id=" << entry_key->view_id + << " generation=" << it->second.generation << ": " << renew_status; + } + } else { + it->second.last_access_time = absl::Now(); + VLOG(1) << "DerivedViewExportManager: serving in-flight fetch on draining entry artifact_id=" + << entry_key->artifact_id << " view_id=" << entry_key->view_id << " generation=" << it->second.generation; + } + + it->second.active_fetches += 1; + active_fetches_.emplace(std::string(fetch_id), ActiveFetch{.key = *entry_key, .generation = it->second.generation}); + VLOG(2) << "DerivedViewExportManager: begin fetch artifact_id=" << entry_key->artifact_id + << " view_id=" << entry_key->view_id << " generation=" << it->second.generation + << " state=" << static_cast(it->second.state) << " active_fetches=" << it->second.active_fetches + << " fetch_id=" << fetch_id << " event=fetch_begin" + << " route_kind=" << kResidentViewRouteKind; + return absl::OkStatus(); +} + +void DerivedViewExportManager::end_fetch(std::string_view fetch_id, std::string_view reason) { + if (fetch_id.empty()) { + return; + } + + absl::MutexLock lock(&mu_); + auto fetch_it = active_fetches_.find(std::string(fetch_id)); + if (fetch_it == active_fetches_.end()) { + VLOG(2) << "DerivedViewExportManager: end 
fetch ignored unknown fetch_id=" << fetch_id << " reason=" << reason; + return; + } + const ArtifactDeviceKey entry_key = fetch_it->second.key; + const uint64_t generation = fetch_it->second.generation; + active_fetches_.erase(fetch_it); + + auto it = entries_.find(entry_key); + if (it == entries_.end()) { + return; + } + if (it->second.generation != generation) { + VLOG(2) << "DerivedViewExportManager: end fetch ignored generation mismatch artifact_id=" << entry_key.artifact_id + << " view_id=" << entry_key.view_id << " fetch_id=" << fetch_id << " reason=" << reason + << " expected_generation=" << generation << " actual_generation=" << it->second.generation; + return; + } + if (it->second.active_fetches == 0) { + VLOG(1) << "DerivedViewExportManager: end fetch observed zero active_fetches artifact_id=" << entry_key.artifact_id + << " view_id=" << entry_key.view_id << " reason=" << reason << " fetch_id=" << fetch_id; + return; + } + + it->second.active_fetches -= 1; + VLOG(2) << "DerivedViewExportManager: end fetch artifact_id=" << entry_key.artifact_id + << " view_id=" << entry_key.view_id << " generation=" << it->second.generation + << " state=" << static_cast(it->second.state) << " active_fetches=" << it->second.active_fetches + << " reason=" << reason << " fetch_id=" << fetch_id << " event=fetch_end" + << " route_kind=" << kResidentViewRouteKind; +} + +std::optional DerivedViewExportManager::find_entry( + const store::loading::ReplicaKey& key) const { + auto entry_key = to_entry_key(key); + if (!entry_key.has_value()) { + return std::nullopt; + } + + absl::MutexLock lock(&mu_); + auto it = entries_.find(*entry_key); + if (it == entries_.end()) { + return std::nullopt; + } + return EntrySnapshot{ + .key = *entry_key, + .replica_key = it->second.replica_key, + .use_lease_id = it->second.use_lease_id, + .retention_lease_id = it->second.retention_lease_id, + .state = it->second.state, + .generation = it->second.generation, + .ttl = it->second.ttl, + .resident_bytes 
= it->second.resident_bytes, + .active_fetches = it->second.active_fetches, + .accept_new_fetches = it->second.accept_new_fetches, + .last_access_time = it->second.last_access_time, + .expiry_time = it->second.expiry_time, + }; +} + +absl::Status DerivedViewExportManager::safe_retire_published_replica( + const store::loading::ReplicaKey& key, + std::string_view reason) const { + engine_.set_replica_publish_state(key, store::StoreEngine::ReplicaPublishState::kRetiring); + + const auto replica_id = engine_.get_replica_global_store_id(key); + if (!replica_id.has_value() || replica_id->empty()) { + return absl::OkStatus(); + } + if (!global_store_client_ || !global_store_client_->is_connected()) { + return absl::FailedPreconditionError("Global Store client unavailable while retiring derived view export"); + } + + auto mark_or = global_store_client_->mark_replica_unavailable(key.artifact_id, *replica_id, std::string(reason)); + if (!mark_or.ok() && !absl::IsNotFound(mark_or.status())) { + return mark_or.status(); + } + + auto drain_or = global_store_client_->wait_replica_drain(*replica_id, kRetireDrainTimeoutMs); + if (!drain_or.ok() && !absl::IsNotFound(drain_or.status())) { + return drain_or.status(); + } + if (drain_or.ok() && !drain_or->drained) { + return absl::DeadlineExceededError("derived view export drain timed out"); + } + return absl::OkStatus(); +} + +absl::Status DerivedViewExportManager::unregister_published_replica(const store::loading::ReplicaKey& key) const { + const auto replica_id = engine_.get_replica_global_store_id(key); + if (!replica_id.has_value() || replica_id->empty()) { + return absl::OkStatus(); + } + if (!global_store_client_ || !global_store_client_->is_connected()) { + return absl::FailedPreconditionError("Global Store client unavailable while unregistering derived view export"); + } + absl::Status unregister_status = global_store_client_->unregister_replica(key.artifact_id, *replica_id); + if (!unregister_status.ok() && 
!absl::IsNotFound(unregister_status)) { + return unregister_status; + } + return absl::OkStatus(); +} + +absl::Status DerivedViewExportManager::arm_retry_retention(const ArtifactDeviceKey& key, uint64_t generation) { + absl::MutexLock lock(&mu_); + auto it = entries_.find(key); + if (it == entries_.end() || it->second.generation != generation) { + return absl::OkStatus(); + } + + auto retention_lease_or = lifecycle_.create_retention_lease( + options_.retry_retire_ttl, {[this, key, generation]() { return this->on_retention_expired(key, generation); }}); + if (!retention_lease_or.ok()) { + return retention_lease_or.status(); + } + + it->second.retention_lease_id = *retention_lease_or; + it->second.state = EntryState::kReady; + it->second.ttl = options_.ttl; + it->second.accept_new_fetches = true; + it->second.expiry_time = absl::Now() + options_.ttl; + return absl::OkStatus(); +} + +absl::Status DerivedViewExportManager::arm_retry_retire(const ArtifactDeviceKey& key, uint64_t generation) { + absl::MutexLock lock(&mu_); + auto it = entries_.find(key); + if (it == entries_.end() || it->second.generation != generation) { + return absl::OkStatus(); + } + + auto retention_lease_or = lifecycle_.create_retention_lease( + options_.retry_retire_ttl, {[this, key, generation]() { return this->on_retry_retire(key, generation); }}); + if (!retention_lease_or.ok()) { + return retention_lease_or.status(); + } + + it->second.retention_lease_id = *retention_lease_or; + it->second.expiry_time = absl::Now() + options_.retry_retire_ttl; + return absl::OkStatus(); +} + +absl::Status DerivedViewExportManager::on_retry_retire(const ArtifactDeviceKey& key, uint64_t generation) { + Entry snapshot; + { + absl::MutexLock lock(&mu_); + auto it = entries_.find(key); + if (it == entries_.end() || it->second.generation != generation) { + return absl::OkStatus(); + } + if (it->second.state != EntryState::kDraining) { + return absl::OkStatus(); + } + snapshot = it->second; + } + return 
retire_entry(key, snapshot, "retry_retire", /*retry_on_failure=*/true); +} + +uint64_t DerivedViewExportManager::current_derived_bytes_locked() const { + uint64_t total_bytes = 0; + for (const auto& [entry_key, entry] : entries_) { + (void)entry_key; + if (entry.state == EntryState::kDraining) { + continue; + } + total_bytes += entry.resident_bytes; + } + return total_bytes; +} + +uint64_t DerivedViewExportManager::current_ready_derived_bytes_locked() const { + uint64_t total_bytes = 0; + for (const auto& [entry_key, entry] : entries_) { + (void)entry_key; + if (entry.state != EntryState::kReady) { + continue; + } + total_bytes += entry.resident_bytes; + } + return total_bytes; +} + +absl::Status DerivedViewExportManager::maybe_evict_for_budget(const std::optional& protected_key) { + const auto snapshot_opt = engine_.get_memory_tier_snapshot(); + if (!snapshot_opt.has_value()) { + return absl::OkStatus(); + } + const auto resident_derived_bytes_locked = [this]() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + uint64_t total_bytes = 0; + for (const auto& [entry_key_ignored, entry] : entries_) { + (void)entry_key_ignored; + if (entry.state == EntryState::kPending) { + continue; + } + total_bytes += entry.resident_bytes; + } + return total_bytes; + }; + DerivedBudgetWindow budget_window; + uint64_t budget_bytes = 0; + uint64_t total_bytes = 0; + std::vector> victims; + { + absl::MutexLock lock(&mu_); + budget_window = compute_budget_window(*snapshot_opt, resident_derived_bytes_locked(), options_); + budget_bytes = budget_window.effective_budget_bytes; + total_bytes = current_derived_bytes_locked(); + if (total_bytes <= budget_bytes) { + return absl::OkStatus(); + } + + std::vector> candidates; + candidates.reserve(entries_.size()); + for (const auto& [entry_key, entry] : entries_) { + if (entry.state != EntryState::kReady || entry.active_fetches > 0) { + continue; + } + if (protected_key.has_value() && *protected_key == entry_key) { + continue; + } + 
candidates.emplace_back(entry_key, entry); + } + + const absl::Time now = absl::Now(); + std::sort(candidates.begin(), candidates.end(), [now](const auto& lhs, const auto& rhs) { + const bool lhs_expired = lhs.second.expiry_time <= now; + const bool rhs_expired = rhs.second.expiry_time <= now; + if (lhs_expired != rhs_expired) { + return lhs_expired > rhs_expired; + } + return lhs.second.last_access_time < rhs.second.last_access_time; + }); + + for (const auto& candidate : candidates) { + if (total_bytes <= budget_bytes) { + break; + } + auto it = entries_.find(candidate.first); + if (it == entries_.end() || it->second.generation != candidate.second.generation || + it->second.state != EntryState::kReady) { + continue; + } + it->second.state = EntryState::kDraining; + total_bytes = total_bytes > candidate.second.resident_bytes ? total_bytes - candidate.second.resident_bytes : 0; + victims.push_back(candidate); + } + } + + if (victims.empty()) { + VLOG(1) << "DerivedViewExportManager: derived-view budget exceeded but no idle victims available" + << " total_bytes=" << total_bytes << " budget_bytes=" << budget_bytes + << " base_budget_bytes=" << budget_window.base_budget_bytes + << " headroom_bytes=" << budget_window.headroom_bytes + << " non_derived_stable_bytes=" << budget_window.non_derived_stable_bytes; + return absl::OkStatus(); + } + + VLOG(1) << "DerivedViewExportManager: pressure eviction start victims=" << victims.size() + << " budget_bytes=" << budget_bytes << " base_budget_bytes=" << budget_window.base_budget_bytes + << " headroom_bytes=" << budget_window.headroom_bytes + << " non_derived_stable_bytes=" << budget_window.non_derived_stable_bytes << " event=eviction_start" + << " route_kind=" << kResidentViewRouteKind; + for (const auto& [victim_key, victim_entry] : victims) { + const absl::Status retire_status = + retire_entry(victim_key, victim_entry, "pressure_budget", /*retry_on_failure=*/false); + if (!retire_status.ok()) { + LOG(WARNING) << 
"DerivedViewExportManager: pressure eviction failed for artifact_id=" << victim_key.artifact_id + << " view_id=" << victim_key.view_id << " generation=" << victim_entry.generation << ": " + << retire_status; + } + } + return absl::OkStatus(); +} + +absl::Status DerivedViewExportManager::retire_entry( + const ArtifactDeviceKey& key, + const Entry& snapshot, + std::string_view reason, + bool retry_on_failure) { + VLOG(1) << "DerivedViewExportManager: retiring entry artifact_id=" << key.artifact_id << " view_id=" << key.view_id + << " device_id=" << key.device_id << " generation=" << snapshot.generation << " reason=" << reason + << " resident_bytes=" << snapshot.resident_bytes << " active_fetches=" << snapshot.active_fetches + << " event=drain_start" + << " route_kind=" << kResidentViewRouteKind; + + bool route_withdrawn = false; + auto handle_retire_failure = [&](const absl::Status& status) -> absl::Status { + LOG(WARNING) << "DerivedViewExportManager: retire step failed for artifact_id=" << key.artifact_id + << " view_id=" << key.view_id << " generation=" << snapshot.generation << " reason=" << reason + << " route_withdrawn=" << route_withdrawn << ": " << status; + if (!route_withdrawn) { + { + absl::MutexLock lock(&mu_); + auto it = entries_.find(key); + if (it != entries_.end() && it->second.generation == snapshot.generation) { + it->second.state = EntryState::kReady; + it->second.accept_new_fetches = true; + } + } + if (retry_on_failure) { + const absl::Status retry_status = arm_retry_retention(key, snapshot.generation); + if (!retry_status.ok()) { + LOG(WARNING) << "DerivedViewExportManager: retry retention arm failed for artifact_id=" << key.artifact_id + << " view_id=" << key.view_id << " generation=" << snapshot.generation << ": " << retry_status; + } + } + engine_.set_replica_publish_state(snapshot.replica_key, store::StoreEngine::ReplicaPublishState::kPublished); + return status; + } + + { + absl::MutexLock lock(&mu_); + auto it = entries_.find(key); + if 
(it != entries_.end() && it->second.generation == snapshot.generation) { + it->second.state = EntryState::kDraining; + it->second.accept_new_fetches = false; + } + } + const absl::Status retry_status = arm_retry_retire(key, snapshot.generation); + if (!retry_status.ok()) { + LOG(WARNING) << "DerivedViewExportManager: retry retire arm failed for artifact_id=" << key.artifact_id + << " view_id=" << key.view_id << " generation=" << snapshot.generation << ": " << retry_status; + } + return status; + }; + + const absl::Status retire_status = safe_retire_published_replica(snapshot.replica_key, reason); + if (!retire_status.ok()) { + return handle_retire_failure(retire_status); + } + route_withdrawn = true; + + { + absl::MutexLock lock(&mu_); + auto it = entries_.find(key); + if (it != entries_.end() && it->second.generation == snapshot.generation) { + it->second.accept_new_fetches = false; + } + } + + LocalDrainWaitContext wait_ctx{ + .manager = this, + .key = key, + .generation = snapshot.generation, + }; + bool local_drain_complete = false; + { + absl::MutexLock lock(&mu_); + local_drain_complete = mu_.AwaitWithTimeout( + absl::Condition(&DerivedViewExportManager::can_finish_local_drain_locked, &wait_ctx), + absl::Milliseconds(kRetireDrainTimeoutMs)); + } + if (!local_drain_complete) { + return handle_retire_failure( + absl::DeadlineExceededError( + absl::StrCat( + "derived view export local drain timed out: artifact_id=", + key.artifact_id, + " view_id=", + key.view_id, + " generation=", + snapshot.generation))); + } + VLOG(1) << "DerivedViewExportManager: local drain complete artifact_id=" << key.artifact_id + << " view_id=" << key.view_id << " generation=" << snapshot.generation << " event=drain_complete" + << " route_kind=" << kResidentViewRouteKind; + + const absl::Status unregister_status = unregister_published_replica(snapshot.replica_key); + if (!unregister_status.ok()) { + return handle_retire_failure(unregister_status); + } + + 
lifecycle_.release_lease(snapshot.retention_lease_id); + lifecycle_.release_lease(snapshot.use_lease_id); + const absl::Status retire_status_runtime = engine_.retire_replica_status(snapshot.replica_key); + if (!retire_status_runtime.ok() && !absl::IsNotFound(retire_status_runtime)) { + LOG(WARNING) << "DerivedViewExportManager: runtime retire failed for artifact_id=" << key.artifact_id + << " view_id=" << key.view_id << " generation=" << snapshot.generation << " reason=" << reason << ": " + << retire_status_runtime; + } + + { + absl::MutexLock lock(&mu_); + auto it = entries_.find(key); + if (it != entries_.end() && it->second.generation == snapshot.generation) { + entries_.erase(it); + } + } + + VLOG(1) << "DerivedViewExportManager: retired entry artifact_id=" << key.artifact_id << " view_id=" << key.view_id + << " device_id=" << key.device_id << " generation=" << snapshot.generation << " reason=" << reason + << " event=retire_complete" + << " route_kind=" << kResidentViewRouteKind; + return absl::OkStatus(); +} + +absl::Status DerivedViewExportManager::on_retention_expired(const ArtifactDeviceKey& key, uint64_t generation) { + Entry snapshot; + { + absl::MutexLock lock(&mu_); + auto it = entries_.find(key); + if (it == entries_.end() || it->second.generation != generation) { + return absl::OkStatus(); + } + if (it->second.state != EntryState::kReady) { + VLOG(2) << "DerivedViewExportManager: skip retention expiry for artifact_id=" << key.artifact_id + << " view_id=" << key.view_id << " device_id=" << key.device_id << " generation=" << generation + << " because state=" << static_cast(it->second.state); + return absl::OkStatus(); + } + it->second.state = EntryState::kDraining; + it->second.accept_new_fetches = true; + snapshot = it->second; + } + return retire_entry(key, snapshot, "ttl_expired", /*retry_on_failure=*/true); +} + +} // namespace tensorcast::daemon diff --git a/daemon/state/derived_view_export_manager.h b/daemon/state/derived_view_export_manager.h new 
file mode 100644 index 000000000..c3f8a8c0d --- /dev/null +++ b/daemon/state/derived_view_export_manager.h @@ -0,0 +1,166 @@ +// Copyright (c) 2025-2026, TensorCast Team. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/time.h" +#include "core/store/components/global_store_client.h" +#include "core/store/materialization/contracts/loading_spec.h" +#include "core/store/store_engine.h" +#include "daemon/state/session_lifecycle.h" +#include "daemon/state/types.h" + +namespace tensorcast::daemon { + +class DerivedViewExportManager { + public: + struct Options { + absl::Duration ttl{absl::Minutes(10)}; + absl::Duration retry_retire_ttl{absl::Seconds(30)}; + std::optional budget_override_bytes; + std::optional headroom_override_bytes; + }; + + enum class EntryState { + kPending = 0, + kReady = 1, + kDraining = 2, + }; + + struct EntrySnapshot { + ArtifactDeviceKey key; + store::loading::ReplicaKey replica_key; + SessionLifecycleManager::LeaseId use_lease_id{0}; + SessionLifecycleManager::LeaseId retention_lease_id{0}; + EntryState state{EntryState::kReady}; + uint64_t generation{0}; + absl::Duration ttl{absl::ZeroDuration()}; + uint64_t resident_bytes{0}; + uint64_t active_fetches{0}; + bool accept_new_fetches{true}; + absl::Time last_access_time{absl::InfinitePast()}; + absl::Time expiry_time{absl::InfiniteFuture()}; + }; + + DerivedViewExportManager(store::StoreEngine& engine, SessionLifecycleManager& lifecycle); + + DerivedViewExportManager(store::StoreEngine& engine, SessionLifecycleManager& lifecycle, Options options); + + void set_global_store_client(std::shared_ptr client); + + [[nodiscard]] absl::Status acquire_prepare_budget(uint64_t reserved_bytes); + + void release_prepare_budget(uint64_t reserved_bytes); + + [[nodiscard]] absl::Status reserve(const store::loading::ReplicaKey& key, 
uint64_t reserved_bytes); + + [[nodiscard]] absl::Status commit_reserved(const store::loading::ReplicaKey& key); + + [[nodiscard]] absl::Status cancel_reserved(const store::loading::ReplicaKey& key); + + [[nodiscard]] absl::Status retain_or_refresh(const store::loading::ReplicaKey& key); + + [[nodiscard]] absl::Status begin_fetch(const store::loading::ReplicaKey& key, std::string_view fetch_id); + + void end_fetch(std::string_view fetch_id, std::string_view reason); + + [[nodiscard]] std::optional find_entry(const store::loading::ReplicaKey& key) const; + + private: + struct PrepareBudgetWaitContext { + const DerivedViewExportManager* manager{nullptr}; + uint64_t reserved_bytes{0}; + uint64_t pending_budget_bytes{0}; + }; + + struct LocalDrainWaitContext { + const DerivedViewExportManager* manager{nullptr}; + ArtifactDeviceKey key; + uint64_t generation{0}; + }; + + struct Entry { + store::loading::ReplicaKey replica_key; + SessionLifecycleManager::LeaseId use_lease_id{0}; + SessionLifecycleManager::LeaseId retention_lease_id{0}; + EntryState state{EntryState::kReady}; + uint64_t generation{0}; + absl::Duration ttl{absl::ZeroDuration()}; + uint64_t resident_bytes{0}; + uint64_t active_fetches{0}; + bool accept_new_fetches{true}; + absl::Time last_access_time{absl::InfinitePast()}; + absl::Time expiry_time{absl::InfiniteFuture()}; + }; + + struct ActiveFetch { + ArtifactDeviceKey key; + uint64_t generation{0}; + }; + + [[nodiscard]] static std::optional to_entry_key(const store::loading::ReplicaKey& key); + + [[nodiscard]] absl::Status install_entry(const ArtifactDeviceKey& key, const store::loading::ReplicaKey& replica_key) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + [[nodiscard]] absl::Status activate_reserved_entry(const ArtifactDeviceKey& key, Entry& entry) + ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + [[nodiscard]] absl::Status renew_entry(const ArtifactDeviceKey& key, Entry& entry) ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + [[nodiscard]] absl::Status 
on_retention_expired(const ArtifactDeviceKey& key, uint64_t generation); + + [[nodiscard]] absl::Status safe_retire_published_replica( + const store::loading::ReplicaKey& key, + std::string_view reason) const; + + [[nodiscard]] absl::Status unregister_published_replica(const store::loading::ReplicaKey& key) const; + + [[nodiscard]] absl::Status arm_retry_retention(const ArtifactDeviceKey& key, uint64_t generation); + + [[nodiscard]] absl::Status arm_retry_retire(const ArtifactDeviceKey& key, uint64_t generation); + + [[nodiscard]] absl::Status on_retry_retire(const ArtifactDeviceKey& key, uint64_t generation); + + [[nodiscard]] static bool can_acquire_prepare_budget_locked(const PrepareBudgetWaitContext* ctx) + ABSL_NO_THREAD_SAFETY_ANALYSIS; + + [[nodiscard]] static bool can_finish_local_drain_locked(const LocalDrainWaitContext* ctx) + ABSL_NO_THREAD_SAFETY_ANALYSIS; + + [[nodiscard]] absl::Status maybe_evict_for_budget(const std::optional& protected_key); + + [[nodiscard]] absl::Status retire_entry( + const ArtifactDeviceKey& key, + const Entry& snapshot, + std::string_view reason, + bool retry_on_failure); + + [[nodiscard]] uint64_t current_ready_derived_bytes_locked() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + [[nodiscard]] uint64_t current_derived_bytes_locked() const ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + store::StoreEngine& engine_; + SessionLifecycleManager& lifecycle_; + Options options_; + std::shared_ptr global_store_client_; + pid_t owner_pid_{0}; + + mutable absl::Mutex mu_; + absl::flat_hash_map entries_ ABSL_GUARDED_BY(mu_); + absl::flat_hash_map active_fetches_ ABSL_GUARDED_BY(mu_); + uint64_t pending_prepare_bytes_ ABSL_GUARDED_BY(mu_){0}; + uint64_t next_generation_ ABSL_GUARDED_BY(mu_){1}; +}; + +} // namespace tensorcast::daemon diff --git a/daemon/state/derived_view_export_manager_eviction_test.cc b/daemon/state/derived_view_export_manager_eviction_test.cc new file mode 100644 index 000000000..b488ffdd6 --- /dev/null +++ 
b/daemon/state/derived_view_export_manager_eviction_test.cc @@ -0,0 +1,213 @@ +// Copyright (c) 2025-2026, TensorCast Team. + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "core/store/device_registry.h" +#include "core/store/materialization/dataplane/view/view_identity.h" +#include "core/store/store_engine.h" +#include "core/store/testing/global_store_client_stub.h" +#include "daemon/state/derived_view_export_manager.h" +#include "daemon/state/lip_manager.h" +#include "daemon/state/ref_tracker.h" +#include "daemon/state/replica_session_manager.h" +#include "daemon/state/session_lifecycle.h" + +namespace { + +using tensorcast::daemon::DerivedViewExportManager; +using tensorcast::daemon::RefTracker; +using tensorcast::daemon::ReplicaSessionManager; +using tensorcast::daemon::SessionLifecycleManager; +using tensorcast::store::DeviceRegistry; +using tensorcast::store::StoreEngine; +using tensorcast::store::StoreEngineOptions; +using tensorcast::store::components::ReplicaDrainStatus; +using tensorcast::store::loader::ViewSpec; +using tensorcast::store::loading::ReplicaKey; +using tensorcast::store::materialization::view::NarrowOp; +using tensorcast::store::materialization::view::TensorViewOps; +using tensorcast::store::materialization::view::ViewOp; +using tensorcast::store::runtime::metadata::ArtifactRegistration; +using tensorcast::store::runtime::metadata::ViewPlacement; +using tensorcast::store::runtime::metadata::ViewRegistration; +using tensorcast::store::runtime::metadata::ViewRegistrationKind; +using tensorcast::store::testing::GlobalStoreClientStub; + +class NoopGlobalStoreClient final : public GlobalStoreClientStub { + public: + absl::Status unregister_replica(std::string_view, std::string_view) override { + return absl::OkStatus(); + } + + absl::StatusOr mark_replica_unavailable( + 
std::string_view, + std::string_view, + std::optional, + std::optional) override { + return true; + } + + absl::StatusOr wait_replica_drain(std::string_view, uint32_t, std::optional) + override { + return ReplicaDrainStatus{.drained = true}; + } +}; + +StoreEngineOptions make_engine_opts(const std::filesystem::path& temp_root) { + StoreEngineOptions opts; + opts.storage_path = temp_root.string(); + opts.memory_pool_size = 32ULL * 1024 * 1024; + opts.tx_slice_bytes = 1ULL << 20; + opts.num_thread = 2; + opts.pinned_memory_timeout = std::chrono::milliseconds(0); + tensorcast::store::MemoryTierConfig tiers; + tiers.stable_bytes = 1ULL << 20; + opts.memory_tier_config = tiers; + return opts; +} + +struct ManagerHarness { + explicit ManagerHarness(const DerivedViewExportManager::Options& options) + : temp_root( + std::filesystem::temp_directory_path() / + absl::StrCat("derived-view-eviction-", absl::ToUnixMicros(absl::Now()))), + engine(std::make_shared(make_engine_opts(temp_root))), + sessions(std::chrono::seconds(60)), + lip(engine, nullptr), + lifecycle(sessions, refs, lip), + manager(*engine, lifecycle, options) { + std::filesystem::create_directories(temp_root); + manager.set_global_store_client(std::make_shared()); + } + + ~ManagerHarness() { + std::error_code ec; + std::filesystem::remove_all(temp_root, ec); + } + + std::filesystem::path temp_root; + std::shared_ptr engine; + ReplicaSessionManager sessions; + RefTracker refs; + tensorcast::daemon::LipManager lip; + SessionLifecycleManager lifecycle; + DerivedViewExportManager manager; +}; + +std::string make_index_json(int64_t numel) { + const uint64_t size_bytes = static_cast(numel) * sizeof(float); + return absl::StrCat("{\"x\":[0,", size_bytes, ",[", numel, "],[1],\"torch.float32\",0]}"); +} + +ViewSpec make_full_view_spec(int64_t numel) { + ViewSpec spec; + TensorViewOps ops; + ops.ops.push_back(ViewOp::Narrow(NarrowOp{.dim = 0, .start = 0, .length = static_cast(numel)})); + spec.tensors.emplace("x", 
std::move(ops)); + return spec; +} + +ReplicaKey register_view_replica(StoreEngine& engine, std::string artifact_id, std::string view_id, int64_t numel = 2) { + ArtifactRegistration reg; + reg.artifact_id = artifact_id; + reg.tensor_index_data = make_index_json(numel); + reg.schema_version = "v3"; + reg.encoding = "json"; + reg.device_id = 0; + reg.total_size_bytes = static_cast(numel) * sizeof(float); + + ViewRegistration view; + view.view_id = view_id; + view.spec = make_full_view_spec(numel); + view.placement = ViewPlacement::kClient; + view.canonical_size_bytes = reg.total_size_bytes; + view.registration_kind = ViewRegistrationKind::kCanonical; + reg.view = std::move(view); + + auto begin_or = engine.begin_register_artifact(reg); + REQUIRE(begin_or.ok()); + auto commit_or = engine.commit_registered_artifact(begin_or->registration_id); + REQUIRE(commit_or.ok()); + REQUIRE(commit_or->view_id.has_value()); + REQUIRE(*commit_or->view_id == view_id); + return ReplicaKey{ + .artifact_id = commit_or->artifact_id, + .view_id = std::optional(std::move(view_id)), + .device = DeviceRegistry::instance().gpu_key(0), + .replica = 0, + }; +} + +DerivedViewExportManager::EntrySnapshot require_entry(DerivedViewExportManager& manager, const ReplicaKey& key) { + auto snapshot = manager.find_entry(key); + REQUIRE(snapshot.has_value()); + return *snapshot; +} + +} // namespace + +TEST_CASE( + "DerivedViewExportManager evicts expired and oldest idle entries before active or pending entries", + "[daemon][derived_view_export_manager][eviction]") { + DerivedViewExportManager::Options options; + options.ttl = absl::Milliseconds(60); + options.budget_override_bytes = 40; + options.headroom_override_bytes = 0; + ManagerHarness harness(options); + + const ReplicaKey expired_idle = register_view_replica(*harness.engine, "artifact-expired", "view-expired"); + REQUIRE(harness.manager.retain_or_refresh(expired_idle).ok()); + absl::SleepFor(absl::Milliseconds(70)); + + const ReplicaKey 
older_idle = register_view_replica(*harness.engine, "artifact-older", "view-older"); + REQUIRE(harness.manager.retain_or_refresh(older_idle).ok()); + absl::SleepFor(absl::Milliseconds(5)); + + const ReplicaKey newer_idle = register_view_replica(*harness.engine, "artifact-newer", "view-newer"); + REQUIRE(harness.manager.retain_or_refresh(newer_idle).ok()); + absl::SleepFor(absl::Milliseconds(5)); + + const ReplicaKey active_entry = register_view_replica(*harness.engine, "artifact-active", "view-active"); + REQUIRE(harness.manager.retain_or_refresh(active_entry).ok()); + REQUIRE(harness.manager.begin_fetch(active_entry, "active-fetch").ok()); + + const ReplicaKey pending_entry{ + .artifact_id = "artifact-pending", + .view_id = std::optional("view-pending"), + .device = DeviceRegistry::instance().gpu_key(0), + .replica = 0, + }; + REQUIRE(harness.manager.reserve(pending_entry, /*reserved_bytes=*/8).ok()); + const auto pending_snapshot = require_entry(harness.manager, pending_entry); + REQUIRE(pending_snapshot.state == DerivedViewExportManager::EntryState::kPending); + + const ReplicaKey first_trigger = register_view_replica(*harness.engine, "artifact-trigger-1", "view-trigger-1"); + REQUIRE(harness.manager.retain_or_refresh(first_trigger).ok()); + REQUIRE_FALSE(harness.manager.find_entry(expired_idle).has_value()); + REQUIRE(harness.manager.find_entry(older_idle).has_value()); + REQUIRE(harness.manager.find_entry(newer_idle).has_value()); + REQUIRE(harness.manager.find_entry(active_entry).has_value()); + REQUIRE(harness.manager.find_entry(pending_entry).has_value()); + + const ReplicaKey second_trigger = register_view_replica(*harness.engine, "artifact-trigger-2", "view-trigger-2"); + REQUIRE(harness.manager.retain_or_refresh(second_trigger).ok()); + REQUIRE_FALSE(harness.manager.find_entry(older_idle).has_value()); + REQUIRE(harness.manager.find_entry(newer_idle).has_value()); + const auto active_snapshot = require_entry(harness.manager, active_entry); + 
REQUIRE(active_snapshot.active_fetches == 1); + REQUIRE(harness.manager.find_entry(pending_entry).has_value()); + + harness.manager.end_fetch("active-fetch", "test_complete"); +} diff --git a/daemon/state/derived_view_export_manager_test.cc b/daemon/state/derived_view_export_manager_test.cc new file mode 100644 index 000000000..f8c430bab --- /dev/null +++ b/daemon/state/derived_view_export_manager_test.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2025-2026, TensorCast Team. + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "core/store/device_registry.h" +#include "core/store/materialization/dataplane/view/view_identity.h" +#include "core/store/store_engine.h" +#include "core/store/testing/global_store_client_stub.h" +#include "daemon/state/derived_view_export_manager.h" +#include "daemon/state/lip_manager.h" +#include "daemon/state/ref_tracker.h" +#include "daemon/state/replica_session_manager.h" +#include "daemon/state/session_lifecycle.h" + +namespace { + +using tensorcast::daemon::DerivedViewExportManager; +using tensorcast::daemon::RefTracker; +using tensorcast::daemon::ReplicaSessionManager; +using tensorcast::daemon::SessionLifecycleManager; +using tensorcast::store::DeviceRegistry; +using tensorcast::store::StoreEngine; +using tensorcast::store::StoreEngineOptions; +using tensorcast::store::components::ReplicaDrainStatus; +using tensorcast::store::loader::ViewSpec; +using tensorcast::store::loading::ReplicaKey; +using tensorcast::store::materialization::view::NarrowOp; +using tensorcast::store::materialization::view::TensorViewOps; +using tensorcast::store::materialization::view::ViewOp; +using tensorcast::store::runtime::metadata::ViewPlacement; +using tensorcast::store::runtime::metadata::ViewRegistration; +using tensorcast::store::runtime::metadata::ViewRegistrationKind; +using 
tensorcast::store::testing::GlobalStoreClientStub; + +class NoopGlobalStoreClient final : public GlobalStoreClientStub { + public: + absl::Status unregister_replica(std::string_view, std::string_view) override { + return absl::OkStatus(); + } + + absl::StatusOr mark_replica_unavailable( + std::string_view, + std::string_view, + std::optional, + std::optional) override { + return true; + } + + absl::StatusOr wait_replica_drain(std::string_view, uint32_t, std::optional) + override { + return ReplicaDrainStatus{.drained = true}; + } +}; + +StoreEngineOptions make_engine_opts(const std::filesystem::path& temp_root) { + StoreEngineOptions opts; + opts.storage_path = temp_root.string(); + opts.memory_pool_size = 32ULL * 1024 * 1024; + opts.tx_slice_bytes = 1ULL << 20; + opts.num_thread = 2; + opts.pinned_memory_timeout = std::chrono::milliseconds(0); + tensorcast::store::MemoryTierConfig tiers; + tiers.stable_bytes = 1ULL << 20; + opts.memory_tier_config = tiers; + return opts; +} + +struct ManagerHarness { + explicit ManagerHarness(const DerivedViewExportManager::Options& options) + : temp_root( + std::filesystem::temp_directory_path() / + absl::StrCat("derived-view-manager-", absl::ToUnixMicros(absl::Now()))), + engine(std::make_shared(make_engine_opts(temp_root))), + sessions(std::chrono::seconds(60)), + lip(engine, nullptr), + lifecycle(sessions, refs, lip), + manager(*engine, lifecycle, options) { + std::filesystem::create_directories(temp_root); + manager.set_global_store_client(std::make_shared()); + } + + ~ManagerHarness() { + std::error_code ec; + std::filesystem::remove_all(temp_root, ec); + } + + std::filesystem::path temp_root; + std::shared_ptr engine; + ReplicaSessionManager sessions; + RefTracker refs; + tensorcast::daemon::LipManager lip; + SessionLifecycleManager lifecycle; + DerivedViewExportManager manager; +}; + +ReplicaKey make_view_key(std::string artifact_id, std::string view_id, int device_ordinal = 0, uint32_t replica = 0) { + return ReplicaKey{ 
+ .artifact_id = std::move(artifact_id), + .view_id = std::optional(std::move(view_id)), + .device = DeviceRegistry::instance().gpu_key(device_ordinal), + .replica = replica, + }; +} + +std::string make_index_json(int64_t numel) { + const uint64_t size_bytes = static_cast(numel) * sizeof(float); + return absl::StrCat("{\"x\":[0,", size_bytes, ",[", numel, "],[1],\"torch.float32\",0]}"); +} + +ViewSpec make_full_view_spec(int64_t numel) { + ViewSpec spec; + TensorViewOps ops; + ops.ops.push_back(ViewOp::Narrow(NarrowOp{.dim = 0, .start = 0, .length = static_cast(numel)})); + spec.tensors.emplace("x", std::move(ops)); + return spec; +} + +ReplicaKey register_view_replica( + StoreEngine& engine, + std::string artifact_id, + std::string view_id, + int device_ordinal) { + auto reg = StoreEngine::ArtifactRegistration{}; + reg.artifact_id = artifact_id; + reg.tensor_index_data = make_index_json(/*numel=*/2); + reg.schema_version = "v3"; + reg.encoding = "json"; + reg.device_id = device_ordinal; + reg.total_size_bytes = 2 * sizeof(float); + + ViewRegistration view; + view.view_id = view_id; + view.spec = make_full_view_spec(/*numel=*/2); + view.placement = ViewPlacement::kClient; + view.canonical_size_bytes = reg.total_size_bytes; + view.registration_kind = ViewRegistrationKind::kCanonical; + reg.view = std::move(view); + + auto begin_or = engine.begin_register_artifact(reg); + REQUIRE(begin_or.ok()); + auto commit_or = engine.commit_registered_artifact(begin_or->registration_id); + REQUIRE(commit_or.ok()); + REQUIRE(commit_or->view_id.has_value()); + REQUIRE(*commit_or->view_id == view_id); + return make_view_key(commit_or->artifact_id, std::move(view_id), device_ordinal); +} + +DerivedViewExportManager::EntrySnapshot require_entry(DerivedViewExportManager& manager, const ReplicaKey& key) { + auto snapshot = manager.find_entry(key); + REQUIRE(snapshot.has_value()); + return *snapshot; +} + +} // namespace + +TEST_CASE("DerivedViewExportManager reuses identical cache 
keys", "[daemon][derived_view_export_manager][reuse]") { + DerivedViewExportManager::Options options; + options.ttl = absl::Seconds(2); + options.budget_override_bytes = 1ULL << 20; + options.headroom_override_bytes = 0; + ManagerHarness harness(options); + + const ReplicaKey key = register_view_replica(*harness.engine, "artifact-a", "view-a", 0); + REQUIRE(harness.manager.retain_or_refresh(key).ok()); + const auto first = require_entry(harness.manager, key); + + REQUIRE(harness.manager.retain_or_refresh(key).ok()); + const auto reused = require_entry(harness.manager, key); + REQUIRE(reused.generation == first.generation); + REQUIRE(reused.use_lease_id == first.use_lease_id); + REQUIRE(reused.retention_lease_id == first.retention_lease_id); + REQUIRE(reused.expiry_time >= first.expiry_time); + + const ReplicaKey different_view = register_view_replica(*harness.engine, "artifact-a", "view-b", 0); + REQUIRE(harness.manager.retain_or_refresh(different_view).ok()); + const auto different_view_entry = require_entry(harness.manager, different_view); + REQUIRE(different_view_entry.generation != first.generation); + + const ReplicaKey different_device = register_view_replica(*harness.engine, "artifact-a", "view-a", 1); + REQUIRE(harness.manager.retain_or_refresh(different_device).ok()); + const auto different_device_entry = require_entry(harness.manager, different_device); + REQUIRE(different_device_entry.generation != first.generation); +} + +TEST_CASE( + "DerivedViewExportManager refreshes TTL only on data-plane use", + "[daemon][derived_view_export_manager][ttl]") { + DerivedViewExportManager::Options options; + options.ttl = absl::Milliseconds(60); + options.budget_override_bytes = 1ULL << 20; + options.headroom_override_bytes = 0; + ManagerHarness harness(options); + + const ReplicaKey key = register_view_replica(*harness.engine, "artifact-ttl", "view-ttl", 0); + REQUIRE(harness.manager.retain_or_refresh(key).ok()); + const auto initial = require_entry(harness.manager, 
key); + + absl::SleepFor(absl::Milliseconds(20)); + const auto after_lookup = require_entry(harness.manager, key); + REQUIRE(after_lookup.expiry_time == initial.expiry_time); + + absl::SleepFor(absl::Milliseconds(20)); + REQUIRE(harness.manager.begin_fetch(key, "fetch-1").ok()); + const auto during_fetch = require_entry(harness.manager, key); + REQUIRE(during_fetch.active_fetches == 1); + REQUIRE(during_fetch.expiry_time > initial.expiry_time); + + harness.manager.end_fetch("fetch-1", "test_complete"); + const auto after_fetch = require_entry(harness.manager, key); + REQUIRE(after_fetch.active_fetches == 0); + REQUIRE(after_fetch.expiry_time == during_fetch.expiry_time); + + absl::SleepFor(absl::Milliseconds(30)); + harness.lifecycle.expire_due(absl::Now()); + REQUIRE(harness.manager.find_entry(key).has_value()); + + absl::SleepFor(absl::Milliseconds(40)); + harness.lifecycle.expire_due(absl::Now()); + REQUIRE_FALSE(harness.manager.find_entry(key).has_value()); +} diff --git a/daemon/state/sweep_tasks.h b/daemon/state/sweep_tasks.h index e3cd1aff3..c77cdf51f 100644 --- a/daemon/state/sweep_tasks.h +++ b/daemon/state/sweep_tasks.h @@ -18,6 +18,7 @@ #include "absl/time/clock.h" #include "core/store/device_registry.h" #include "core/store/store_engine.h" +#include "daemon/state/derived_view_export_manager.h" #include "daemon/state/ipc_region_registry.h" #include "daemon/state/ref_tracker.h" #include "daemon/state/session_lifecycle.h" @@ -38,13 +39,17 @@ class IBackgroundTask { class LockTtlTask final : public IBackgroundTask { public: - LockTtlTask(TransportLockManager& locks, store::StoreEngine& engine) : locks_(locks), engine_(engine) {} + LockTtlTask(TransportLockManager& locks, store::StoreEngine& engine, DerivedViewExportManager* derived_view_exports) + : locks_(locks), engine_(engine), derived_view_exports_(derived_view_exports) {} void run_once() override { for (const auto& tok : locks_.tokens()) { auto expired = locks_.remove_if_expired(tok); if 
(expired.has_value()) { // UMA final: no engine-level unlock; TTL sweep clears daemon bookkeeping only. + if (expired->key.view_id.has_value() && derived_view_exports_ != nullptr) { + derived_view_exports_->end_fetch(tok, "lock_ttl_expired"); + } VLOG(1) << "LockTtlTask: expired transport lock cleared for artifact_id=" << expired->key.artifact_id; } } @@ -57,6 +62,7 @@ class LockTtlTask final : public IBackgroundTask { private: TransportLockManager& locks_; [[maybe_unused]] store::StoreEngine& engine_; + DerivedViewExportManager* derived_view_exports_{nullptr}; }; class RegionRegistrySweepTask final : public IBackgroundTask { diff --git a/daemon/state/transport_lock_manager.h b/daemon/state/transport_lock_manager.h index 4268176b8..6ae3033e1 100644 --- a/daemon/state/transport_lock_manager.h +++ b/daemon/state/transport_lock_manager.h @@ -55,6 +55,17 @@ class TransportLockManager { return locks_.erase(token) > 0; } + std::optional take(const std::string& token) { + absl::MutexLock l(&mu_); + auto it = locks_.find(token); + if (it == locks_.end()) { + return std::nullopt; + } + LockEntry entry = it->second; + locks_.erase(it); + return entry; + } + bool has_lock_for_key(const store::loading::ReplicaKey& key) const { absl::MutexLock l(&mu_); for (const auto& entry : locks_) { diff --git a/daemon/testing/daemon_service_harness.cc b/daemon/testing/daemon_service_harness.cc index 1e48c117d..29dab9a6a 100644 --- a/daemon/testing/daemon_service_harness.cc +++ b/daemon/testing/daemon_service_harness.cc @@ -125,6 +125,7 @@ absl::StatusOr> DaemonServiceHarness::crea kernel->persistence_manager()->set_global_store_client(global_store_client.get()); } kernel->lip_manager().set_global_store_client(global_store_client); + kernel->derived_view_export_manager().set_global_store_client(global_store_client); } MaterializationController::Dep mdep{ @@ -141,6 +142,7 @@ absl::StatusOr> DaemonServiceHarness::crea .identity = kernel->worker_identity_store(), .global_store_client = 
global_store_client, .lifecycle = &kernel->lifecycle_manager(), + .derived_view_exports = &kernel->derived_view_export_manager(), .handle_leases = kernel->handle_leases(), .capability_tokens = kernel->capability_tokens(), .cpu_shared_memory_enabled = options.cpu_shared_memory_enabled, @@ -162,7 +164,11 @@ absl::StatusOr> DaemonServiceHarness::crea auto registration_controller = std::make_unique(rdep); TransportController::Dep tdep{ - .engine = kernel->engine(), .locks = kernel->transport_lock_manager(), .lip = kernel->lip_manager()}; + .engine = kernel->engine(), + .locks = kernel->transport_lock_manager(), + .lip = kernel->lip_manager(), + .derived_view_exports = &kernel->derived_view_export_manager(), + }; auto transport_controller = std::make_unique(tdep); StatusController::Dep sdep{ diff --git a/docs/plans/0086-source-side-remote-view-transport.md b/docs/plans/0086-source-side-remote-view-transport.md new file mode 100644 index 000000000..d3cedc4a0 --- /dev/null +++ b/docs/plans/0086-source-side-remote-view-transport.md @@ -0,0 +1,192 @@ +--- +slug: source-side-remote-view-transport +title: Source-Side Remote View Transport Plan +status: in_progress +areas: ["global_store", "core", "daemon", "proto", "tests", "docs"] +created: 2026-03-15 +last_updated: 2026-03-18 +related_code: + - docs/designs/0086-source-side-remote-view-transport.md + - docs/architecture/p2p-transfer-strategies.md + - proto/tensorcast/global_store/v1/global_store.proto + - tensorcast/global_store/rpc/transport_rpc_handler.py + - tensorcast/global_store/services/transport_service.py + - tensorcast/global_store/repositories/replica_repository.py + - core/store/components/global_store_client.h + - core/store/components/global_store_client.cc + - core/store/materialization/control/materialize_orchestrator.cc + - core/store/runtime/metadata/metadata_gateway.cc + - daemon/service/controllers/replica_materialization_service.cc + - daemon/state/lip_manager.cc + - daemon/state/types.h + - schema.sql 
+links: + design: ../designs/0086-source-side-remote-view-transport.md +--- + +# Objective + +Implement view-aware remote transport so `request_view_transport(view)` can route either: + +- an already-resident dense view replica; or +- a canonical source daemon that can derive and export the requested dense view byte-space on demand. + +The target outcome is to eliminate destination-side canonical reconstruction for remote TP-sliced loads while preserving TensorCast view semantics and mixed-version correctness. + +# Current State and Root Causes + +Current behavior has four blocking gaps: + +- view residency publication is best-effort because `record_view_residency` is still effectively a no-op for Global Store routing; +- `request_view_transport` only routes already-routable view sources; +- lookup miss falls back to canonical transport, so the destination daemon reconstructs the view locally; +- receiver-side canonical fallback reintroduces read amplification and strided repack on remote TP loads. + +# Phases and Milestones + +- [x] Phase 0: Contract Grounding + - [x] Milestone 0.1: Lock the route kinds and transport semantics: `resident_view`, `derived_view_from_canonical`, `canonical_fallback`. + - [x] Milestone 0.2: Define the minimal view residency record needed for routing, including `artifact_id`, `view_id`, `view_size_bytes`, optional `view_data_hash`, source identity, and placement. + - [x] Milestone 0.3: Define lifecycle rules for resident views versus ephemeral transport-scoped derived views. + +- [x] Phase 1: First-Class View Residency in Global Store + - [x] Milestone 1.1: Implement Global Store persistence and query for view residency so `record_view_residency` becomes routable state. + - [x] Milestone 1.2: Plumb view residency publication from daemon completion and registration paths. + - [x] Milestone 1.3: Update `request_view_transport(view)` to route already-resident view sources before canonical fallback. 
+ +- [x] Phase 2: Route Model and Compatibility + - [x] Milestone 2.1: Extend the transport response model so route kind is explicit and observable. + - [x] Milestone 2.2: Carry view-scoped metadata needed for integrity and debugging, including size and optional hash semantics. + - [x] Milestone 2.3: Keep mixed-version compatibility by retaining canonical fallback when source, destination, or Global Store lacks view-aware capability. + +- [x] Phase 3: Source-Side Derived View Export + - [x] Milestone 3.1: Extend view transport lookup into lookup-or-derive over eligible canonical sources. + - [x] Milestone 3.2: Implement source-daemon derive-on-demand export that reuses existing local view dataplane primitives. + - [x] Milestone 3.3: Bind derived-view exports to transport lease or TTL cleanup without turning them into durable global replicas by default. + +- [x] Phase 4: Destination Daemon Integration + - [x] Milestone 4.1: Teach the destination daemon to ingest dense view transport directly without canonical reconstruction. + - [x] Milestone 4.2: Preserve the existing local materialize, register, and requester-export behavior after ingest completes. + - [x] Milestone 4.3: Keep canonical fallback functional and correct as the last compatibility path. + +- [x] Phase 5: Verification and Correctness + - [x] Milestone 5.1: Ensure view payloads use view-scoped verification semantics and never reuse canonical verification incorrectly. + - [x] Milestone 5.2: Validate `need_view_data_hash=false` behavior so skipped view hashing does not weaken byte-space correctness. + - [x] Milestone 5.3: Add end-to-end correctness tests for resident-view routing, derive-on-demand routing, and canonical fallback. + +- [x] Phase 6: Benchmark and Acceptance + - [x] Milestone 6.1: Re-run remote TP load benchmarks and verify receiver-side amplification and pack disappear on routed view transport. 
+ - [x] Milestone 6.2: Compare `resident_view`, `derived_view_from_canonical`, and `canonical_fallback` route behavior with route-kind observability enabled. + - [x] Milestone 6.3: Update architecture and benchmark docs with before/after results and rollout guidance. + +- [ ] Phase 7: Ephemeral Derived-View Lifecycle Hardening + - Note: the first slow `tp=4` remote-update regression came from over-conservative prepare admission rather than true budget exhaustion. Phase 7 hardening must keep fallback tied to real admission failure, not to an arbitrary serialization gate. + - [ ] Milestone 7.1: Introduce daemon-owned derived-view entry state keyed by `(artifact_id, view_id, device)` so source-side exports are tracked as reusable ephemeral residents rather than one-shot transport cleanup artifacts. + - [x] Task 7.1a: Define an explicit derived-view entry record with lifecycle state, local export handle, routing identity, TTL deadline, and `active_fetches`. + - [ ] Task 7.1b: Add a daemon-local index keyed by `(artifact_id, view_id, device)` and a secondary lookup by exported replica identity for cleanup and observability. + - [x] Task 7.1c: Split entry states at least into `preparing`, `ready`, and `draining`, with invariants documented at the manager boundary. + - [ ] Task 7.1d: Make concurrent requests for the same key coalesce onto one prepare path instead of exporting duplicate dense views. + - [x] Milestone 7.2: Implement sliding TTL semantics for derived views, with refresh only on successful data-plane use and not on control-plane probes or route lookup. + - [x] Task 7.2a: Add a configurable TTL for source-side derived views with a conservative default suitable for repeated model-load/update bursts. + - [x] Task 7.2b: Refresh TTL on successful data-plane admission/use only, not on `RequestViewTransport` lookup hits or other control-plane reads. 
+ - [x] Task 7.2c: Define how `preparing` entries age, including whether they are protected from expiry until first successful publish/use. + - [x] Milestone 7.3: Implement pressure-aware admission and eviction for derived views so expired idle entries are reclaimed first, then oldest idle non-expired entries, while `preparing` and `active_fetches>0` entries remain protected. + - [x] Task 7.3a: Track derived-view residency against the source daemon memory budget separately from durable canonical residency. + - [x] Task 7.3b: Add an admission check before prepare/publish that can reclaim eligible entries before creating a new dense view. + - [x] Task 7.3c: Implement eviction ordering: expired idle first, then oldest idle non-expired, never `preparing`, never `active_fetches>0`. + - [x] Task 7.3d: Define the fallback policy when admission still fails after eviction: return canonical fallback rather than overcommitting source-daemon memory. + - [x] Milestone 7.4: Implement ordered retirement semantics on the source daemon: mark draining, withdraw Global Store route, wait for in-flight fetches to drain, unregister export state, then release local memory. + - [x] Task 7.4a: Introduce an explicit drain transition that blocks new fetches before local export teardown starts. + - [x] Task 7.4b: Make Global Store route withdrawal happen before local export release so stale-route fetches cannot observe a dead endpoint. + - [x] Task 7.4c: Wait for `active_fetches==0` before unregistering export state and releasing local backing memory. + - [x] Task 7.4d: Ensure exceptional cleanup and normal TTL/pressure retirement share the same ordered teardown path. + - [x] Milestone 7.5: Ensure canonical fallback remains the compatibility and resource-exhaustion backstop when derived-view admission cannot safely proceed. + - [x] Task 7.5a: Keep route-kind observability explicit so `derived_view_from_canonical` and `canonical_fallback` remain distinguishable in logs and metrics. 
+ - [x] Task 7.5b: Preserve current compatibility fallback when source, destination, or Global Store lacks the needed view-aware capability. + - [x] Task 7.5c: Add explicit fallback reasons for admission failure, route invalidation, and lifecycle-race handling. + - [x] Milestone 7.6: Add lifecycle metrics and logs for creation, reuse hit, TTL refresh, drain, eviction, and fallback reason so repeated remote update/load workloads are diagnosable. + - [x] Task 7.6a: Add counters or structured logs for create/reuse/refresh/expire/evict/drain/fallback events. + - [x] Task 7.6b: Make logs include `(artifact_id, view_id, device)` and route kind so cross-daemon correlation is possible during remote benchmarks. + - [x] Task 7.6c: Add enough observability to explain repeated-trial behavior such as “trial 2/3 reused source-side dense view from trial 1”. + - [ ] Milestone 7.7: Add focused tests: + - [x] Test 7.7a: unit test for derived-view cache keying and reuse, proving repeated identical requests hit the same source-side ephemeral export within TTL. + - [x] Test 7.7b: unit test for sliding TTL refresh semantics, proving data-plane use extends lifetime while pure control-plane observation does not. + - [x] Test 7.7c: unit test for eviction ordering, proving expired idle entries evict before non-expired idle entries and active/preparing entries are never reclaimed. + - [ ] Test 7.7d: integration test for ordered retirement, proving route withdrawal happens before local export teardown and prevents stale-route connection failures. + - [x] Test 7.7e: integration test for repeated multi-version remote update/load workloads, proving source-side derived views do not accumulate unbounded DRAM and later trials continue to succeed. + - [x] Test 7.7f: regression test for fallback behavior under forced admission pressure, proving the system degrades to canonical transport without semantic breakage. 
+ +- [x] Phase 8: Deadline and Wait-Budget Semantics for Source-Side Upgrade + - Note: the current remote-update failures show that source-side derived-view upgrade still inherits an internal ~30s wait budget from `pinned_allocation_timeout_ms`, even when the caller allows a much larger end-to-end deadline. This phase makes caller deadline authoritative and stops using pinned-allocation timeout as a distributed upgrade deadline. + - [x] Milestone 8.1: Re-establish authoritative request-budget semantics for remote materialization and source-side upgrade. + - [x] Task 8.1a: Change daemon-side materialization request budget derivation so it uses the incoming gRPC deadline as the authoritative request budget, instead of deriving request budget from `pinned_allocation_timeout_ms`. + - [x] Task 8.1b: Keep `pinned_allocation_timeout_ms` scoped to local pinned/staging allocation waits only; do not copy it into `request_budget` or `transport_wait_timeout`. + - [x] Task 8.1c: Make source-side upgrade wait budget derive from remaining authoritative request budget, so daemon B may keep waiting for daemon A's derived-view export until the caller deadline is nearly exhausted. + - [x] Milestone 8.2: Make timeout and fallback semantics principled. + - [x] Task 8.2a: Preserve canonical fallback for compatibility, route-unavailable, lifecycle-race, and admission/resource failures. + - [x] Task 8.2b: Stop doing late canonical fallback when source-side upgrade merely exhausts the remaining request deadline; propagate timeout instead. + - [x] Task 8.2c: Ensure logs and status reasons distinguish `admission/resource fallback` from `deadline exhausted while waiting for source-side export readiness`. + - [x] Milestone 8.3: Add observability for budget propagation and upgrade waiting. + - [x] Task 8.3a: Log caller request budget, gRPC deadline remaining, pinned timeout, transport wait timeout, and source-side prepare wait budget at materialization entry / upgrade decision points. 
+ - [x] Task 8.3b: Add route-selection / fallback logs that make it obvious whether a request stayed on `resident_view`, downgraded to canonical transport, or terminated due to deadline exhaustion. + - [x] Milestone 8.4: Validate the new semantics against the known regression cases. + - [x] Task 8.4a: Re-run `update_weight_remote` for `qwen3-32b tp=2/4` and confirm source-side upgrade no longer falls back after an accidental internal ~30s wait-budget cap. + - [x] Task 8.4b: Re-run `load_weight_remote` and confirm the new request-budget semantics do not regress the existing relay fast path. + +# Suggested Implementation Order for Phase 7 + +- [x] Step 7.A: Add the daemon-local derived-view entry manager and state machine. +- [x] Step 7.B: Move current source-side export registration onto the new manager so repeated requests reuse a stable in-memory entry instead of one-shot cleanup state. +- [x] Step 7.C: Add `active_fetches` accounting and drain semantics first, before TTL or eviction, so teardown ordering is correct. + - Scope note: relay hot-path completion requires transport-session-backed source-side fetch accounting via daemon-to-daemon `BeginReplicaFetch` / `EndReplicaFetch`; daemon lock/unlock alone is not sufficient. +- [x] Step 7.D: Add sliding TTL refresh on data-plane use. +- [x] Step 7.E: Add admission plus eviction logic and wire it into prepare-time decisions. +- [ ] Step 7.F: Replace ad hoc cleanup paths with one ordered retirement path shared by TTL expiry, pressure eviction, and exceptional cleanup. +- [x] Step 7.G: Add observability for reuse, drain, eviction, and fallback reason. +- [x] Step 7.H: Land unit tests for cache keying, TTL refresh, and eviction ordering. +- [ ] Step 7.I: Land integration tests for ordered retirement and repeated multi-version remote update/load stability. 
+ +# Validation Matrix + +Global Store and daemon control plane: + +- view residency publish/query behavior +- route-kind response correctness +- mixed-version canonical fallback + +Data path correctness: + +- resident dense view transport matches requested `view_id` +- derived dense view transport matches requested `view_id` +- canonical fallback remains functionally unchanged + +Lifecycle correctness: + +- derived-view TTL reuse works across repeated fetches of the same `(artifact_id, view_id, device)` +- derived-view entries are not retained indefinitely after TTL expiry or pressure eviction +- stale resident-view routes are withdrawn before local export release +- active fetches are never torn down underneath an in-flight transfer + +Benchmark acceptance: + +- remote TP relay no longer shows receiver-side canonical amplification for routed view transport +- destination-side repack time is eliminated or near-zero on the routed path +- transport time converges toward source-side derivation plus wire transfer, not destination-side reconstruction +- repeated remote update/load trials do not regress due to leaked source-side ephemeral views + +# Primary Risks + +- routing stale or inconsistent view residency: mitigate with explicit lifecycle rules and lease-scoped cleanup. +- semantic drift between resident view and derived view transport: mitigate with shared view metadata contract and end-to-end byte-space tests. +- mixed-version rollout ambiguity: mitigate with explicit route kind and capability-gated canonical fallback. +- over-promoting ephemeral derived views into durable state: mitigate by keeping derive-on-demand export daemon-owned, TTL-scoped, and pressure-evictable by default. +- stale source-side routes after local export teardown: mitigate with ordered drain and route-withdraw-first retirement. +- repeated multi-version workloads exhausting source-daemon DRAM: mitigate with bounded ephemeral residency plus eviction/admission control. 
+ +# Owner Checklist + +- [x] Design doc and plan stay aligned during implementation. +- [x] Proto and schema changes remain additive and compatibility-safe. +- [x] Route kind is explicit in metrics and logs. +- [x] Benchmark evidence demonstrates the intended TP>1 remote load improvement. +- [x] Lifecycle evidence demonstrates repeated remote load/update trials reuse derived views when hot and reclaim them when cold. diff --git a/proto/tensorcast/config/v1/daemon_config.proto b/proto/tensorcast/config/v1/daemon_config.proto index 33edf16c8..9424eceec 100644 --- a/proto/tensorcast/config/v1/daemon_config.proto +++ b/proto/tensorcast/config/v1/daemon_config.proto @@ -71,6 +71,17 @@ message Lifecycle { uint32 max_mints_per_second = 3; } HandleLeases handle_leases = 20; + + message DerivedViewExports { + // Sliding TTL for daemon-owned source-side derived view exports. + // This governs how long an idle derived view stays reusable before + // source-side retirement begins. + google.protobuf.Duration ttl = 1; + // Retry TTL used when an ordered retire attempt fails and the daemon + // needs a bounded backoff window before trying teardown again. + google.protobuf.Duration retry_retire_ttl = 2; + } + DerivedViewExports derived_view_exports = 21; } message HighAvailability { @@ -163,6 +174,12 @@ message PinnedMemory { reserved 4; reserved "min_bytes", "max_bytes"; bool rdma_preregister = 5; + // Best-effort NUMA node binding for the backing pinned slab before + // cudaHostRegister. Leave unset/-1 to keep the OS default policy. + int32 numa_node = 6; + // If true and numa_node >= 0, prefault the slab before cudaHostRegister so + // placement/fault cost is paid at daemon startup rather than on first use. 
+ bool numa_prefault = 7; } reserved 1; diff --git a/proto/tensorcast/daemon/v2/store_daemon.proto b/proto/tensorcast/daemon/v2/store_daemon.proto index 2d4fa189d..608a05c2f 100644 --- a/proto/tensorcast/daemon/v2/store_daemon.proto +++ b/proto/tensorcast/daemon/v2/store_daemon.proto @@ -42,6 +42,8 @@ service StoreDaemonService { // Dual-end locking mechanism for P2P transfers rpc LockTransportChunks(LockTransportChunksRequest) returns (LockTransportChunksResponse) {} rpc UnlockTransportChunks(UnlockTransportChunksRequest) returns (UnlockTransportChunksResponse) {} + rpc BeginReplicaFetch(BeginReplicaFetchRequest) returns (BeginReplicaFetchResponse) {} + rpc EndReplicaFetch(EndReplicaFetchRequest) returns (EndReplicaFetchResponse) {} // ========== Memory Artifact Registration ========== // New canonical RPC names @@ -723,6 +725,27 @@ message UnlockTransportChunksResponse { // Empty response, returns empty message on success } +message BeginReplicaFetchRequest { + string transport_id = 1; + string artifact_id = 2; + string view_id = 3; + DeviceType device_type = 4; + optional int32 device_id = 5; +} + +message BeginReplicaFetchResponse { + bool managed = 1; +} + +message EndReplicaFetchRequest { + string transport_id = 1; + optional string reason = 2; +} + +message EndReplicaFetchResponse { + bool managed = 1; +} + // ========== Memory Artifact Registration Messages ========== // Begin registration request diff --git a/proto/tensorcast/global_store/v1/global_store.proto b/proto/tensorcast/global_store/v1/global_store.proto index 95dfe5da1..79142beac 100644 --- a/proto/tensorcast/global_store/v1/global_store.proto +++ b/proto/tensorcast/global_store/v1/global_store.proto @@ -788,6 +788,22 @@ message RequestReplicaTransportResponse { Status status = 1; tensorcast.common.v1.MemoryInfo remote_memory_info = 2; string transport_id = 3; + TransportRouteKind route_kind = 4; + ViewTransportMetadata view_transport_metadata = 5; + uint32 source_grpc_port = 6; +} + +enum 
TransportRouteKind { + TRANSPORT_ROUTE_KIND_UNSPECIFIED = 0; + TRANSPORT_ROUTE_KIND_CANONICAL = 1; + TRANSPORT_ROUTE_KIND_RESIDENT_VIEW = 2; + TRANSPORT_ROUTE_KIND_DERIVED_VIEW_FROM_CANONICAL = 3; +} + +message ViewTransportMetadata { + string view_id = 1; + uint64 view_size_bytes = 2; + optional string view_data_hash = 3; } message TransportSchedulingGroup { diff --git a/tensorcast/global_store/grpc_service.py b/tensorcast/global_store/grpc_service.py index 28b2f72f5..412ce248a 100644 --- a/tensorcast/global_store/grpc_service.py +++ b/tensorcast/global_store/grpc_service.py @@ -441,6 +441,8 @@ def _rebuild_runtime_services_and_handlers(self) -> None: ) self.transport_rpc_handler = TransportRpcHandler( transport_service=self.transport_service, + view_repository=self.view_repository, + worker_repository=self.worker_repository, replica_to_memory_info=self._replica_to_memory_info, logger=logger, ) diff --git a/tensorcast/global_store/repositories/view_repository.py b/tensorcast/global_store/repositories/view_repository.py index c5b3a59db..13a092b9c 100644 --- a/tensorcast/global_store/repositories/view_repository.py +++ b/tensorcast/global_store/repositories/view_repository.py @@ -56,12 +56,15 @@ def upsert( canonical_bytes_covered ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT (artifact_id, view_id) DO UPDATE SET - view_spec_json = EXCLUDED.view_spec_json, + view_spec_json = CASE + WHEN EXCLUDED.view_spec_json <> '' THEN EXCLUDED.view_spec_json + ELSE views.view_spec_json + END, view_size = EXCLUDED.view_size, - view_data_hash = EXCLUDED.view_data_hash, - verified_at = EXCLUDED.verified_at, - canonical_size_bytes = EXCLUDED.canonical_size_bytes, - canonical_bytes_covered = EXCLUDED.canonical_bytes_covered + view_data_hash = COALESCE(EXCLUDED.view_data_hash, views.view_data_hash), + verified_at = COALESCE(EXCLUDED.verified_at, views.verified_at), + canonical_size_bytes = COALESCE(EXCLUDED.canonical_size_bytes, views.canonical_size_bytes), + canonical_bytes_covered = COALESCE(EXCLUDED.canonical_bytes_covered, views.canonical_bytes_covered) """, params, ) diff --git a/tensorcast/global_store/repositories/worker_repository.py b/tensorcast/global_store/repositories/worker_repository.py index ac9801f18..9cdc7ce6e 100644 --- a/tensorcast/global_store/repositories/worker_repository.py +++ b/tensorcast/global_store/repositories/worker_repository.py @@ -1315,6 +1315,25 @@ def find_by_node_id(self, node_id: str) -> Worker | None: finally: cursor.close() + def find_by_worker_id(self, worker_id: str) -> Worker | None: + """Find an active worker by worker_id.""" + with self._write_lock: + cursor = self.get_cursor() + try: + row = cursor.execute( + f""" + {_WORKER_SELECT} + WHERE workers.worker_id = ? 
AND workers.inactive_at IS NULL + LIMIT 1 + """, + [worker_id], + ).fetchone() + if row: + return self._row_to_model(row) + return None + finally: + cursor.close() + def list_active(self, accepting_only: bool = False) -> list[Worker]: """Return active workers, optionally filtering by accepting status.""" # Map accepting_only flag to existing include_unavailable parameter diff --git a/tensorcast/global_store/rpc/transport_rpc_handler.py b/tensorcast/global_store/rpc/transport_rpc_handler.py index 9e5852bce..cf939f23c 100644 --- a/tensorcast/global_store/rpc/transport_rpc_handler.py +++ b/tensorcast/global_store/rpc/transport_rpc_handler.py @@ -21,6 +21,8 @@ TransportCompletionOutcome, TransportSchedulingGroup, ) +from tensorcast.global_store.repositories.view_repository import ViewRepository +from tensorcast.global_store.repositories.worker_repository import WorkerRepository from tensorcast.global_store.services.transport_service import TransportService from tensorcast.observability.otel import set_span_attributes from tensorcast.proto.common.v1 import common_pb2 @@ -34,10 +36,14 @@ def __init__( self, *, transport_service: TransportService, + view_repository: ViewRepository, + worker_repository: WorkerRepository, replica_to_memory_info: Callable[[Replica], common_pb2.MemoryInfo], logger, ) -> None: self._transport_service = transport_service + self._view_repository = view_repository + self._worker_repository = worker_repository self._replica_to_memory_info = replica_to_memory_info self._logger = logger @@ -138,17 +144,41 @@ def request_replica_transport( ) remote_info = self._replica_to_memory_info(replica) + route_kind = self._route_kind_for_selection( + requested_view_id=requested_view_id, + replica=replica, + ) + view_transport_metadata = self._build_view_transport_metadata( + artifact_id=request.artifact_id, + requested_view_id=requested_view_id, + replica=replica, + ) from contextlib import suppress with suppress(Exception): 
set_span_attributes({"tc.transport.id": str(transport_id)}) + set_span_attributes({"tc.transport.route_kind": int(route_kind)}) + if view_transport_metadata is not None: + set_span_attributes( + { + "tc.transport.view_id": view_transport_metadata.view_id, + "tc.transport.view_size_bytes": int( + view_transport_metadata.view_size_bytes + ), + } + ) - return global_store_pb2.RequestReplicaTransportResponse( + response = global_store_pb2.RequestReplicaTransportResponse( status=global_store_pb2.Status.STATUS_OK, remote_memory_info=remote_info, transport_id=str(transport_id), + route_kind=route_kind, + source_grpc_port=self._resolve_source_grpc_port(replica), ) + if view_transport_metadata is not None: + response.view_transport_metadata.CopyFrom(view_transport_metadata) + return response except NotFoundError: self._logger.info( @@ -180,6 +210,63 @@ def request_replica_transport( status=global_store_pb2.Status.STATUS_ERROR ) + @staticmethod + def _route_kind_for_selection( + *, requested_view_id: str | None, replica: Replica + ) -> global_store_pb2.TransportRouteKind: + if ( + requested_view_id + and replica.byte_space.kind.name == "VIEW" + and replica.byte_space.id == requested_view_id + ): + return ( + global_store_pb2.TransportRouteKind.TRANSPORT_ROUTE_KIND_RESIDENT_VIEW + ) + if requested_view_id: + return global_store_pb2.TransportRouteKind.TRANSPORT_ROUTE_KIND_DERIVED_VIEW_FROM_CANONICAL + return global_store_pb2.TransportRouteKind.TRANSPORT_ROUTE_KIND_CANONICAL + + def _build_view_transport_metadata( + self, *, artifact_id: str, requested_view_id: str | None, replica: Replica + ) -> global_store_pb2.ViewTransportMetadata | None: + metadata_view_id = requested_view_id + if metadata_view_id is None and ( + replica.byte_space.kind.name == "VIEW" and replica.byte_space.id is not None + ): + metadata_view_id = replica.byte_space.id + if metadata_view_id is None: + return None + view_row = self._view_repository.get( + artifact_id=artifact_id, + view_id=metadata_view_id, 
+ ) + if view_row is None: + if ( + replica.byte_space.kind.name != "VIEW" + or replica.byte_space.id != metadata_view_id + ): + return None + return global_store_pb2.ViewTransportMetadata( + view_id=metadata_view_id, + view_size_bytes=int(replica.memory_size), + ) + metadata = global_store_pb2.ViewTransportMetadata( + view_id=metadata_view_id, + view_size_bytes=int(view_row["view_size"]), + ) + view_data_hash = view_row.get("view_data_hash") + if view_data_hash: + metadata.view_data_hash = str(view_data_hash) + return metadata + + def _resolve_source_grpc_port(self, replica: Replica) -> int: + if replica.worker_id is None or not replica.worker_id: + return 0 + worker = self._worker_repository.find_by_worker_id(replica.worker_id) + if worker is None: + return 0 + return int(worker.grpc_port) + def complete_replica_transport( self, request: global_store_pb2.CompleteReplicaTransportRequest, diff --git a/tensorcast/global_store/rpc/view_proof_rpc_handler.py b/tensorcast/global_store/rpc/view_proof_rpc_handler.py index ca07a6f71..9d64e8436 100644 --- a/tensorcast/global_store/rpc/view_proof_rpc_handler.py +++ b/tensorcast/global_store/rpc/view_proof_rpc_handler.py @@ -136,8 +136,6 @@ def update_artifact_view_state( view = request.view if not view.view_id: raise ValidationError("view.view_id is required") - if not view.view_spec_json: - raise ValidationError("view.view_spec_json is required") if view.view_size <= 0: raise ValidationError("view.view_size must be positive") verified_at = ( diff --git a/tensorcast/global_store/services/transport_service.py b/tensorcast/global_store/services/transport_service.py index 990ba14a9..925d332ab 100644 --- a/tensorcast/global_store/services/transport_service.py +++ b/tensorcast/global_store/services/transport_service.py @@ -94,6 +94,44 @@ def _source_balance_weights(self) -> SourceBalanceWeights: diffusion_bonus=float(policy.diffusion_bonus_weight), ) + def _has_any_transport_route( + self, *, artifact_id: str, view_id: str | None 
+ ) -> bool: + if self.replica_repository.has_any_replica(artifact_id, view_id): + return True + if view_id is None: + return False + return self.replica_repository.has_any_replica(artifact_id, None) + + def _select_transport_source( + self, + *, + artifact_id: str, + view_id: str | None, + heartbeat_timeout_seconds: float, + scheduler_mode: str, + source_balance_weights: SourceBalanceWeights | None = None, + cursor=None, + ): + primary = self.replica_repository.find_available_for_transport( + artifact_id=artifact_id, + view_id=view_id, + heartbeat_timeout_seconds=heartbeat_timeout_seconds, + scheduler_mode=scheduler_mode, + source_balance_weights=source_balance_weights, + cursor=cursor, + ) + if primary.replica is not None or view_id is None: + return primary + return self.replica_repository.find_available_for_transport( + artifact_id=artifact_id, + view_id=None, + heartbeat_timeout_seconds=heartbeat_timeout_seconds, + scheduler_mode=scheduler_mode, + source_balance_weights=source_balance_weights, + cursor=cursor, + ) + @staticmethod def _normalize_request_id(request_id: str) -> str: normalized = request_id.strip() @@ -291,7 +329,7 @@ def _request_transport_group_dispatch( start_time = time.time() timeout_deadline = start_time + (max(0, wait_timeout_ms) / 1000.0) - if not self.replica_repository.has_any_replica(artifact_id, view_id): + if not self._has_any_transport_route(artifact_id=artifact_id, view_id=view_id): inc_transport_request(artifact_id, "not_found") observe_transport_wait(artifact_id, time.time() - start_time) raise NotFoundError(f"No replicas registered for artifact {artifact_id}") @@ -317,7 +355,9 @@ def _request_transport_group_dispatch( try: with self.replica_repository.transaction() as tx: - self.pending_transport_request_repository.purge_malformed_rows(cursor=tx) + self.pending_transport_request_repository.purge_malformed_rows( + cursor=tx + ) existing_pending = ( self.pending_transport_request_repository.find_by_request_id( request_id, cursor=tx 
@@ -756,7 +796,7 @@ def _dispatch_pending_requests(self, *, tx) -> int: if dispatched >= dispatch_limit: break - selection = self.replica_repository.find_available_for_transport( + selection = self._select_transport_source( artifact_id=pending_request.artifact_id, view_id=pending_request.requested_view_id, heartbeat_timeout_seconds=self.config.heartbeat_timeout_ms / 1000, @@ -869,7 +909,7 @@ def complete_transport( except DatabaseError as exc: # Keep public contract stable for callers/tests. if isinstance(exc.__cause__, NotFoundError): - raise exc.__cause__ + raise exc.__cause__ from None raise if released: @@ -993,8 +1033,7 @@ def cleanup_expired_transports(self, expiration_seconds: int | None = None) -> i transport.transport_id, outcome=TransportCompletionOutcome.EXPIRED, outcome_detail=( - "cleanup_malformed_inflight " - f"status={transport.status}" + f"cleanup_malformed_inflight status={transport.status}" ), ) malformed_cleaned += 1 diff --git a/tests/python/global_store/test_grpc_service.py b/tests/python/global_store/test_grpc_service.py index a4d1c74c4..22718e466 100644 --- a/tests/python/global_store/test_grpc_service.py +++ b/tests/python/global_store/test_grpc_service.py @@ -935,6 +935,14 @@ def test_request_replica_transport_respects_byte_space( ), test_context, ) + servicer.view_repository.upsert( + artifact_id=artifact_id, + view_id=view_id, + view_spec_json='{"kind":"dense_view"}', + view_size=256, + view_data_hash="view-hash-1", + verified_at=None, + ) servicer.WorkerHeartbeat( global_store_pb2.WorkerHeartbeatRequest( worker_id=registered_worker, @@ -944,6 +952,8 @@ def test_request_replica_transport_respects_byte_space( ), test_context, ) + worker = servicer.worker_repository.find_by_worker_id(registered_worker) + assert worker is not None view_request = global_store_pb2.RequestReplicaTransportRequest( artifact_id=artifact_id, @@ -959,11 +969,19 @@ def test_request_replica_transport_respects_byte_space( ) view_response = 
servicer.RequestReplicaTransport(view_request, test_context) assert view_response.status == global_store_pb2.Status.STATUS_OK + assert ( + view_response.route_kind + == global_store_pb2.TransportRouteKind.TRANSPORT_ROUTE_KIND_RESIDENT_VIEW + ) assert ( view_response.remote_memory_info.byte_space.kind == common_pb2.BYTE_SPACE_KIND_VIEW ) assert view_response.remote_memory_info.byte_space.id == view_id + assert view_response.view_transport_metadata.view_id == view_id + assert view_response.view_transport_metadata.view_size_bytes == 256 + assert view_response.view_transport_metadata.view_data_hash == "view-hash-1" + assert view_response.source_grpc_port == worker.grpc_port canonical_request = global_store_pb2.RequestReplicaTransportRequest( artifact_id=artifact_id, @@ -981,10 +999,79 @@ def test_request_replica_transport_respects_byte_space( canonical_request, test_context ) assert canonical_response.status == global_store_pb2.Status.STATUS_OK + assert ( + canonical_response.route_kind + == global_store_pb2.TransportRouteKind.TRANSPORT_ROUTE_KIND_CANONICAL + ) assert ( canonical_response.remote_memory_info.byte_space.kind == common_pb2.BYTE_SPACE_KIND_CANONICAL ) + assert not canonical_response.HasField("view_transport_metadata") + assert canonical_response.source_grpc_port == worker.grpc_port + + def test_request_replica_transport_routes_canonical_source_for_view_miss( + self, servicer, test_context, memory_info, registered_worker + ): + artifact_id = "mi2:artifact-bytespace-derived" + view_id = "view-derived-1" + servicer.RegisterReplica( + global_store_pb2.RegisterReplicaRequest( + artifact_id=artifact_id, + mem_info=memory_info, + max_concurrency=1, + worker_id=registered_worker, + ), + test_context, + ) + servicer.view_repository.upsert( + artifact_id=artifact_id, + view_id=view_id, + view_spec_json='{"kind":"dense_view"}', + view_size=2048, + view_data_hash="derived-hash-1", + verified_at=None, + ) + servicer.WorkerHeartbeat( + 
global_store_pb2.WorkerHeartbeatRequest( + worker_id=registered_worker, + mem_pool_available_size=7000000000, + accepting_new_requests=True, + state_version=1, + ), + test_context, + ) + worker = servicer.worker_repository.find_by_worker_id(registered_worker) + assert worker is not None + + request = global_store_pb2.RequestReplicaTransportRequest( + artifact_id=artifact_id, + local_memory_info=memory_info, + wait_timeout_dur=duration_pb2.Duration(seconds=1), + source_node_id="source_node", + source_address="192.168.1.2", + source_port=9000, + request_id="transport-bytespace-derived-1", + requested_byte_space=common_pb2.ByteSpaceRef( + kind=common_pb2.BYTE_SPACE_KIND_VIEW, id=view_id + ), + ) + + response = servicer.RequestReplicaTransport(request, test_context) + + assert response.status == global_store_pb2.Status.STATUS_OK + assert ( + response.route_kind + == global_store_pb2.TransportRouteKind.TRANSPORT_ROUTE_KIND_DERIVED_VIEW_FROM_CANONICAL + ) + assert ( + response.remote_memory_info.byte_space.kind + == common_pb2.BYTE_SPACE_KIND_CANONICAL + ) + assert response.view_transport_metadata.view_id == view_id + assert response.view_transport_metadata.view_size_bytes == 2048 + assert response.view_transport_metadata.view_data_hash == "derived-hash-1" + assert response.source_grpc_port == worker.grpc_port def test_get_artifact_index_by_id_with_multibase( self, servicer, test_context, memory_info, registered_worker @@ -1870,6 +1957,85 @@ def test_update_artifact_view_state_roundtrip(self, servicer, test_context): assert detail.missing_leaf_ranges[0].count == 1 assert test_context.code == grpc.StatusCode.NOT_FOUND + def test_update_artifact_view_state_accepts_metadata_only_view_residency( + self, servicer, test_context + ): + artifact_id = "mi2:index_hash:view_residency" + servicer.artifacts_repo.upsert_artifact( + artifact_id=artifact_id, + index_multihash="index_hash", + data_multihash="data_hash", + schema_version="v3", + encoding="json", + ) + + create_resp = 
servicer.UpdateArtifactViewState( + global_store_pb2.UpdateArtifactViewStateRequest( + artifact_id=artifact_id, + view=global_store_pb2.ViewUpsert( + view_id="view-resident", + view_size=8192, + view_data_hash="hash-a", + ), + ), + test_context, + ) + assert create_resp.status == global_store_pb2.Status.STATUS_OK + assert test_context.code is None + + test_context.code = None + list_resp = servicer.ListViews( + global_store_pb2.ListViewsRequest(artifact_id=artifact_id), + test_context, + ) + assert list_resp.status == global_store_pb2.Status.STATUS_OK + assert len(list_resp.views) == 1 + assert list_resp.views[0].view_id == "view-resident" + assert list_resp.views[0].view_spec_json == "" + assert list_resp.views[0].view_size == 8192 + assert list_resp.views[0].view_data_hash == "hash-a" + assert test_context.code is None + + test_context.code = None + preserve_resp = servicer.UpdateArtifactViewState( + global_store_pb2.UpdateArtifactViewStateRequest( + artifact_id=artifact_id, + view=global_store_pb2.ViewUpsert( + view_id="view-resident", + view_spec_json="{\"kind\":\"tp\"}", + view_size=8192, + view_data_hash="hash-a", + ), + ), + test_context, + ) + assert preserve_resp.status == global_store_pb2.Status.STATUS_OK + assert test_context.code is None + + test_context.code = None + servicer.UpdateArtifactViewState( + global_store_pb2.UpdateArtifactViewStateRequest( + artifact_id=artifact_id, + view=global_store_pb2.ViewUpsert( + view_id="view-resident", + view_size=8192, + ), + ), + test_context, + ) + assert test_context.code is None + + test_context.code = None + preserved_list_resp = servicer.ListViews( + global_store_pb2.ListViewsRequest(artifact_id=artifact_id), + test_context, + ) + assert preserved_list_resp.status == global_store_pb2.Status.STATUS_OK + assert len(preserved_list_resp.views) == 1 + assert preserved_list_resp.views[0].view_spec_json == "{\"kind\":\"tp\"}" + assert preserved_list_resp.views[0].view_data_hash == "hash-a" + assert test_context.code 
is None + def test_write_tensor_proof_commitments_roundtrip(self, servicer, test_context): artifact_id = "mi2:index_hash:data_hash" servicer.artifacts_repo.upsert_artifact( From 2d37f5b45f3ec5e531b6cbce22d636c6e8042a6a Mon Sep 17 00:00:00 2001 From: zhouyuhan Date: Thu, 19 Mar 2026 19:36:07 +0800 Subject: [PATCH 09/10] fix(cli): tensorcast.runtime import ambiguity --- tensorcast/cli.py | 14 +++++++------- tensorcast/startup.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorcast/cli.py b/tensorcast/cli.py index c014335b9..a28895768 100644 --- a/tensorcast/cli.py +++ b/tensorcast/cli.py @@ -9,7 +9,7 @@ import click -from tensorcast import runtime +import tensorcast.runtime as runtime_module from tensorcast.cli_utils import ServiceError, global_store_manager, service_manager from tensorcast.cli_utils.health import ping_global_store from tensorcast.cli_utils.network import resolve_connect_host @@ -61,7 +61,7 @@ def _parse_endpoint(value: str) -> str: return f"{host}:{port}" -def _runtime_session_to_dict(session: runtime.RuntimeSession) -> dict: +def _runtime_session_to_dict(session: runtime_module.RuntimeSession) -> dict: return { "session_id": session.session_id, "daemon": { @@ -82,7 +82,7 @@ def _runtime_session_to_dict(session: runtime.RuntimeSession) -> dict: } -def _echo_daemon_status(session: runtime.RuntimeSession) -> None: +def _echo_daemon_status(session: runtime_module.RuntimeSession) -> None: click.echo(f"Daemon session: {session.session_id}") if session.daemon_pid: click.echo(f" status : running (pid={session.daemon_pid})") @@ -292,7 +292,7 @@ def _check_conflict(path: str) -> None: global_store_mode = "connect" global_store_address = _parse_endpoint(global_store_address) - existing = runtime.status() + existing = runtime_module.status() if existing is not None: click.echo( "Error: A StoreDaemon is already running. 
Use the existing daemon " @@ -305,7 +305,7 @@ def _check_conflict(path: str) -> None: _echo_daemon_status(existing) sys.exit(1) - session_obj = runtime.start( + session_obj = runtime_module.start( daemon_config=config, session_id=session, global_store_mode=global_store_mode, @@ -343,7 +343,7 @@ def _check_conflict(path: str) -> None: def daemon_stop(force: bool, session: str | None): """Stop the Store Daemon.""" try: - runtime.stop(session_id=session, force=force) + runtime_module.stop(session_id=session, force=force) except ServiceError as e: click.echo(f"Error: {e}", err=True) sys.exit(1) @@ -368,7 +368,7 @@ def daemon_stop(force: bool, session: str | None): def daemon_status(session: str | None, as_json: bool): """Check daemon status using runtime state + health checks.""" try: - session_obj = runtime.status(session) + session_obj = runtime_module.status(session) if session_obj is None: click.echo( "No local daemon session found. Start one with 'tensorcast daemon start'." diff --git a/tensorcast/startup.py b/tensorcast/startup.py index 6eddec142..760c05da9 100644 --- a/tensorcast/startup.py +++ b/tensorcast/startup.py @@ -27,7 +27,7 @@ from pydantic import BaseModel, ConfigDict, field_validator -from tensorcast import runtime +import tensorcast.runtime as runtime_module from tensorcast.api._config import clear_daemon_address, set_daemon_address from tensorcast.cli_utils.config import discover_daemon_config from tensorcast.cli_utils.health import ping_daemon, wait_for_daemon @@ -141,7 +141,7 @@ def close(self) -> None: if self.is_owner and self.session_id: # Stop daemon session we launched. 
try: - runtime.stop(session_id=self.session_id) + runtime_module.stop(session_id=self.session_id) finally: _clear_auto_state_if_matches(self.session_id, self.address) self._closed = True @@ -190,7 +190,7 @@ def _handle(signum, _frame): # noqa: ANN001 def _current_session_address() -> str | None: - session = runtime.status() + session = runtime_module.status() if session and session.daemon_address: return session.daemon_address from tensorcast.cli_utils.service_manager import get_session_address @@ -513,7 +513,7 @@ def _start_context( logger, ) -> Context: global _current_ctx - session_obj = runtime.start( + session_obj = runtime_module.start( daemon_config=cfg_path, session_id=session_id, global_store_mode=global_store_mode, @@ -615,7 +615,7 @@ def _init_auto_mode( ) timeout_s = _auto_wait_timeout_seconds() while True: - existing = runtime.status() + existing = runtime_module.status() existing_address = ( existing.daemon_address if existing From 92a03ea2a1d645fe64e3c711c90107943daaf14b Mon Sep 17 00:00:00 2001 From: zhouyuhan Date: Thu, 19 Mar 2026 19:37:07 +0800 Subject: [PATCH 10/10] fix(artifact): normalize artifact id resolution --- tensorcast/api/store/artifact.py | 8 +++++++- tensorcast/tools/weight_publisher.py | 15 ++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/tensorcast/api/store/artifact.py b/tensorcast/api/store/artifact.py index da6407594..bdb86ec82 100644 --- a/tensorcast/api/store/artifact.py +++ b/tensorcast/api/store/artifact.py @@ -2601,7 +2601,13 @@ def _ensure_identified(self) -> str: if self._artifact_id: return self._artifact_id if self._key_hint: - artifact_id = runtime.resolve_key_mapping_cached(key=self._key_hint) + resolved_mapping = runtime.resolve_key_mapping_cached( + key=self._key_hint + ) + if isinstance(resolved_mapping, tuple): + artifact_id = resolved_mapping[0] + else: + artifact_id = resolved_mapping if not artifact_id: raise ArtifactError( f"Artifact key '{self._key_hint}' is not mapped", diff
--git a/tensorcast/tools/weight_publisher.py b/tensorcast/tools/weight_publisher.py index 509d8ea16..3e440b692 100644 --- a/tensorcast/tools/weight_publisher.py +++ b/tensorcast/tools/weight_publisher.py @@ -31,6 +31,17 @@ _CGID_SUFFIX_ALLOWED = re.compile(r"[-._~A-Za-z0-9]+") +def _normalize_resolved_artifact_id(value: object) -> str | None: + if isinstance(value, tuple): + if not value: + return None + value = value[0] + if value is None: + return None + resolved = str(value).strip() + return resolved or None + + class WeightPublisherConfig(BaseModel): """Configuration for publishing weights to Tensorcast and triggering reloads. @@ -562,7 +573,9 @@ def _wait_for_key_mapping( last_err: Exception | None = None while time.monotonic() < deadline: try: - resolved = resolve_artifact(key=str(artifact_key)).artifact_id + resolved = _normalize_resolved_artifact_id( + resolve_artifact(key=str(artifact_key)).artifact_id + ) except Exception as exc: # noqa: BLE001 last_err = exc time.sleep(float(self._config.key_mapping_poll_interval_s))