@@ -1906,6 +1906,18 @@ namespace phy_engine::verilog::digital
19061906 ::std::uint64_t const* off_blocks,
19071907 ::std::uint32_t off_words,
19081908 ::std::uint8_t* out_hits) noexcept;
1909+
1910+ // Optional optimization: keep OFF-set resident on device(s) across many hits-off queries.
1911+ // This is particularly important for Espresso-style loops which may do many incremental checks.
1912+ extern "C" void* phy_engine_pe_synth_cuda_espresso_off_create(::std::uint32_t device_mask,
1913+ ::std::uint32_t var_count,
1914+ ::std::uint64_t const* off_blocks,
1915+ ::std::uint32_t off_words) noexcept;
1916+ extern "C" void phy_engine_pe_synth_cuda_espresso_off_destroy(void* handle) noexcept;
1917+ extern "C" bool phy_engine_pe_synth_cuda_espresso_off_hits(void* handle,
1918+ cuda_cube_desc const* cubes,
1919+ ::std::size_t cube_count,
1920+ ::std::uint8_t* out_hits) noexcept;
19091921#endif
19101922
19111923 [[nodiscard]] inline bool
@@ -2125,6 +2137,103 @@ namespace phy_engine::verilog::digital
21252137#endif
21262138 }
21272139
2140+ struct cuda_espresso_off_raii
2141+ {
2142+ void* handle{};
2143+ ::std::uint32_t off_words{};
2144+
2145+ cuda_espresso_off_raii() noexcept = default;
2146+ cuda_espresso_off_raii(cuda_espresso_off_raii const&) = delete;
2147+ cuda_espresso_off_raii& operator=(cuda_espresso_off_raii const&) = delete;
2148+
2149+ cuda_espresso_off_raii(cuda_espresso_off_raii&& o) noexcept : handle(o.handle), off_words(o.off_words)
2150+ {
2151+ o.handle = nullptr;
2152+ o.off_words = 0u;
2153+ }
2154+ cuda_espresso_off_raii& operator=(cuda_espresso_off_raii&& o) noexcept
2155+ {
2156+ if(this == &o) { return *this; }
2157+ reset();
2158+ handle = o.handle;
2159+ off_words = o.off_words;
2160+ o.handle = nullptr;
2161+ o.off_words = 0u;
2162+ return *this;
2163+ }
2164+
2165+ ~cuda_espresso_off_raii() noexcept { reset(); }
2166+
2167+ void reset() noexcept
2168+ {
2169+ #if defined(PHY_ENGINE_ENABLE_CUDA_PE_SYNTH)
2170+ if(handle != nullptr) { phy_engine_pe_synth_cuda_espresso_off_destroy(handle); }
2171+ #endif
2172+ handle = nullptr;
2173+ off_words = 0u;
2174+ }
2175+ };
2176+
2177+ [[nodiscard]] inline cuda_espresso_off_raii cuda_espresso_off_create(::std::uint32_t device_mask,
2178+ ::std::uint32_t var_count,
2179+ ::std::uint64_t const* off_blocks,
2180+ ::std::uint32_t off_words) noexcept
2181+ {
2182+ cuda_espresso_off_raii r{};
2183+ r.off_words = off_words;
2184+ #if defined(PHY_ENGINE_ENABLE_CUDA_PE_SYNTH)
2185+ if(off_blocks == nullptr || off_words == 0u || var_count == 0u || var_count > 16u) { return r; }
2186+ auto const t0 = ::std::chrono::steady_clock::now();
2187+ r.handle = phy_engine_pe_synth_cuda_espresso_off_create(device_mask, var_count, off_blocks, off_words);
2188+ auto const us =
2189+ static_cast<std::size_t>(::std::chrono::duration_cast<::std::chrono::microseconds>(::std::chrono::steady_clock::now() - t0).count());
2190+ cuda_trace_add(u8"espresso_off_create",
2191+ 0u,
2192+ off_words,
2193+ static_cast<std::size_t>(off_words) * sizeof(::std::uint64_t),
2194+ 0u,
2195+ us,
2196+ r.handle != nullptr);
2197+ #else
2198+ (void)device_mask;
2199+ (void)var_count;
2200+ (void)off_blocks;
2201+ (void)off_words;
2202+ cuda_trace_add(u8"espresso_off_create", 0u, off_words, 0u, 0u, 0u, false, u8"not_built");
2203+ #endif
2204+ return r;
2205+ }
2206+
2207+ [[nodiscard]] inline bool cuda_espresso_off_hits(cuda_espresso_off_raii const& off,
2208+ cuda_cube_desc const* cubes,
2209+ ::std::size_t cube_count,
2210+ ::std::uint8_t* out_hits) noexcept
2211+ {
2212+ #if defined(PHY_ENGINE_ENABLE_CUDA_PE_SYNTH)
2213+ if(off.handle == nullptr || cubes == nullptr || out_hits == nullptr || cube_count == 0u) { return false; }
2214+ auto const t0 = ::std::chrono::steady_clock::now();
2215+ bool const ok = phy_engine_pe_synth_cuda_espresso_off_hits(off.handle, cubes, cube_count, out_hits);
2216+ auto const us =
2217+ static_cast<std::size_t>(::std::chrono::duration_cast<::std::chrono::microseconds>(::std::chrono::steady_clock::now() - t0).count());
2218+ // OFF-set already resident, so H2D is just the cubes; D2H is the hits vector.
2219+ cuda_trace_add(u8"espresso_hits_off",
2220+ cube_count,
2221+ off.off_words,
2222+ cube_count * sizeof(cuda_cube_desc),
2223+ cube_count * sizeof(::std::uint8_t),
2224+ us,
2225+ ok);
2226+ return ok;
2227+ #else
2228+ (void)off;
2229+ (void)cubes;
2230+ (void)cube_count;
2231+ (void)out_hits;
2232+ cuda_trace_add(u8"espresso_hits_off", cube_count, off.off_words, 0u, 0u, 0u, false, u8"not_built");
2233+ return false;
2234+ #endif
2235+ }
2236+
21282237 [[nodiscard]] inline ::std::uint64_t eval_u64_cone_cpu(cuda_u64_cone_desc const& c) noexcept
21292238 {
21302239 // Leaf patterns (variable i toggles every 2^i minterms, with bit0 corresponding to minterm 0).
@@ -10501,6 +10610,16 @@ namespace phy_engine::verilog::digital
1050110610 return false;
1050210611 };
1050310612
10613+ // Optional: keep OFF-set resident on GPU across the Espresso loop to avoid re-uploading it for each query.
10614+ // This matters because Espresso can generate a very large number of incremental "hits-off" checks.
10615+ cuda_espresso_off_raii off_gpu{};
10616+ bool use_off_gpu{};
10617+ if(opt.cuda_enable && blocksU != 0u && var_count <= 16u)
10618+ {
10619+ off_gpu = cuda_espresso_off_create(opt.cuda_device_mask, static_cast<::std::uint32_t>(var_count), off_bits.data(), static_cast<::std::uint32_t>(blocksU));
10620+ use_off_gpu = (off_gpu.handle != nullptr);
10621+ }
10622+
1050410623 auto cube_hits_off_batch = [&](::std::vector<qm_implicant> const& cubes, ::std::vector<std::uint8_t>& out_hits) noexcept -> void
1050510624 {
1050610625 out_hits.assign(cubes.size(), 0u);
@@ -10519,13 +10638,20 @@ namespace phy_engine::verilog::digital
1051910638 desc[i].value = cubes[i].value;
1052010639 desc[i].mask = cubes[i].mask;
1052110640 }
10522- used_cuda = cuda_espresso_cube_hits_off(opt.cuda_device_mask,
10523- desc.data(),
10524- desc.size(),
10525- static_cast<std::uint32_t>(var_count),
10526- off_bits.data(),
10527- static_cast<std::uint32_t>(blocksU),
10528- out_hits.data());
10641+ if(use_off_gpu)
10642+ {
10643+ used_cuda = cuda_espresso_off_hits(off_gpu, desc.data(), desc.size(), out_hits.data());
10644+ }
10645+ else
10646+ {
10647+ used_cuda = cuda_espresso_cube_hits_off(opt.cuda_device_mask,
10648+ desc.data(),
10649+ desc.size(),
10650+ static_cast<std::uint32_t>(var_count),
10651+ off_bits.data(),
10652+ static_cast<std::uint32_t>(blocksU),
10653+ out_hits.data());
10654+ }
1052910655 }
1053010656 else
1053110657 {
@@ -10569,46 +10695,106 @@ namespace phy_engine::verilog::digital
1056910695 auto expand_cover_batch = [&]() noexcept -> bool
1057010696 {
1057110697 if(cover.empty() || var_order.empty()) { return false; }
10698+ // GPU-friendly structural batching:
10699+ // The classic per-variable expand loop creates millions of tiny "hits-off" batches.
10700+ // Here we generate a larger pool per round (single- and double-literal relaxations),
10701+ // query hits-off in one batched call, then apply the best safe candidate per cube.
1057210702 bool any{};
10573- bool changed{true};
10574- ::std::vector<std::size_t> idx{};
10703+ constexpr std::size_t max_rounds = 3;
10704+ auto const var_limit = ::std::min<std::size_t>(var_order.size(), 12u);
10705+
10706+ struct cand_meta
10707+ {
10708+ std::size_t cube_idx{};
10709+ std::uint8_t drop_cnt{};
10710+ };
10711+
10712+ ::std::vector<cand_meta> meta{};
1057510713 ::std::vector<qm_implicant> cands{};
1057610714 ::std::vector<std::uint8_t> hits{};
1057710715
10578- while(changed )
10716+ for(std::size_t round{}; round < max_rounds; ++round )
1057910717 {
10580- changed = false;
10581- for(auto const v: var_order)
10718+ meta.clear();
10719+ cands.clear();
10720+
10721+ // Conservative reserve; still bounded.
10722+ meta.reserve(cover.size() * (var_limit + (var_limit * (var_limit - 1u)) / 2u));
10723+ cands.reserve(cover.size() * (var_limit + (var_limit * (var_limit - 1u)) / 2u));
10724+
10725+ for(std::size_t ci{}; ci < cover.size(); ++ci)
1058210726 {
10583- idx.clear();
10584- cands.clear();
10585- idx.reserve(cover.size() / 2u + 1u);
10586- cands.reserve(cover.size() / 2u + 1u);
10727+ auto const& c0 = cover[ci];
10728+ ::std::vector<::std::uint16_t> bits{};
10729+ bits.reserve(var_limit);
10730+ for(std::size_t oi{}; oi < var_limit; ++oi)
10731+ {
10732+ auto const v = var_order[oi];
10733+ auto const bit = static_cast<::std::uint16_t>(1u << v);
10734+ if(((c0.mask >> v) & 1u) != 0u) { continue; }
10735+ bits.push_back(bit);
10736+ }
10737+ if(bits.empty()) { continue; }
1058710738
10588- auto const bit = static_cast<::std::uint16_t>(1u << v);
10589- for(std::size_t i{}; i < cover.size(); ++i)
10590- {
10591- auto const& c = cover[i];
10592- if((c.mask >> v) & 1u) { continue; } // already don't care
10593- qm_implicant cand = c;
10594- cand.mask = static_cast<::std::uint16_t>(cand.mask | bit);
10595- cand.value = static_cast<::std::uint16_t>(cand.value & static_cast<::std::uint16_t>(~bit));
10596- idx.push_back(i);
10739+ for(auto const b1 : bits)
10740+ {
10741+ qm_implicant cand = c0;
10742+ cand.mask = static_cast<::std::uint16_t>(cand.mask | b1);
10743+ cand.value = static_cast<::std::uint16_t>(cand.value & static_cast<::std::uint16_t>(~b1));
10744+ meta.push_back(cand_meta{ci, 1u});
1059710745 cands.push_back(cand);
1059810746 }
10599- if(cands.empty()) { continue; }
1060010747
10601- cube_hits_off_batch(cands, hits);
10602- if(hits.size() != cands.size()) { continue; }
10748+ for(std::size_t i{}; i < bits.size(); ++i)
10749+ {
10750+ for(std::size_t j{i + 1u}; j < bits.size(); ++j)
10751+ {
10752+ auto const b = static_cast<::std::uint16_t>(bits[i] | bits[j]);
10753+ qm_implicant cand = c0;
10754+ cand.mask = static_cast<::std::uint16_t>(cand.mask | b);
10755+ cand.value = static_cast<::std::uint16_t>(cand.value & static_cast<::std::uint16_t>(~b));
10756+ meta.push_back(cand_meta{ci, 2u});
10757+ cands.push_back(cand);
10758+ }
10759+ }
10760+ }
10761+
10762+ if(cands.empty()) { break; }
1060310763
10604- for(std::size_t k{}; k < cands.size(); ++k)
10764+ cube_hits_off_batch(cands, hits);
10765+ if(hits.size() != cands.size()) { break; }
10766+
10767+ ::std::vector<int> best{};
10768+ ::std::vector<std::uint8_t> best_drop{};
10769+ best.assign(cover.size(), -1);
10770+ best_drop.assign(cover.size(), 0u);
10771+
10772+ for(std::size_t k{}; k < cands.size(); ++k)
10773+ {
10774+ if(hits[k]) { continue; }
10775+ auto const ci = meta[k].cube_idx;
10776+ if(ci >= cover.size()) { continue; }
10777+ auto const dcnt = meta[k].drop_cnt;
10778+ if(dcnt > best_drop[ci])
1060510779 {
10606- if(hits[k]) { continue; }
10607- cover[idx[k]] = cands[k];
10608- changed = true;
10609- any = true;
10780+ best_drop[ci] = dcnt;
10781+ best[ci] = static_cast<int>(k);
1061010782 }
1061110783 }
10784+
10785+ bool changed{};
10786+ for(std::size_t ci{}; ci < cover.size(); ++ci)
10787+ {
10788+ auto const bi = best[ci];
10789+ if(bi < 0) { continue; }
10790+ auto const cand = cands[static_cast<std::size_t>(bi)];
10791+ if(cand.mask == cover[ci].mask && cand.value == cover[ci].value) { continue; }
10792+ cover[ci] = cand;
10793+ changed = true;
10794+ any = true;
10795+ }
10796+
10797+ if(!changed) { break; }
1061210798 }
1061310799 return any;
1061410800 };
0 commit comments