Skip to content
This repository was archived by the owner on Apr 29, 2026. It is now read-only.

Commit 3f1e5f1

Browse files
committed
Add CUDA support for Espresso OFF-set management in synthesis
- Introduced new structures and functions for managing OFF-set data on CUDA devices, enhancing performance for incremental checks in Espresso-style loops. - Implemented RAII-style management for CUDA resources, ensuring proper cleanup and resource handling. - Updated existing synthesis logic to utilize the new CUDA capabilities, improving efficiency and reducing overhead during synthesis operations. - Enhanced telemetry for CUDA operations related to OFF-set management, providing better insights into performance metrics.
1 parent 0b58982 commit 3f1e5f1

2 files changed

Lines changed: 469 additions & 63 deletions

File tree

include/phy_engine/verilog/digital/pe_synth.h

Lines changed: 219 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1906,6 +1906,18 @@ namespace phy_engine::verilog::digital
19061906
::std::uint64_t const* off_blocks,
19071907
::std::uint32_t off_words,
19081908
::std::uint8_t* out_hits) noexcept;
1909+
1910+
// Optional optimization: keep OFF-set resident on device(s) across many hits-off queries.
1911+
// This is particularly important for Espresso-style loops which may do many incremental checks.
1912+
extern "C" void* phy_engine_pe_synth_cuda_espresso_off_create(::std::uint32_t device_mask,
1913+
::std::uint32_t var_count,
1914+
::std::uint64_t const* off_blocks,
1915+
::std::uint32_t off_words) noexcept;
1916+
extern "C" void phy_engine_pe_synth_cuda_espresso_off_destroy(void* handle) noexcept;
1917+
extern "C" bool phy_engine_pe_synth_cuda_espresso_off_hits(void* handle,
1918+
cuda_cube_desc const* cubes,
1919+
::std::size_t cube_count,
1920+
::std::uint8_t* out_hits) noexcept;
19091921
#endif
19101922

19111923
[[nodiscard]] inline bool
@@ -2125,6 +2137,103 @@ namespace phy_engine::verilog::digital
21252137
#endif
21262138
}
21272139

2140+
struct cuda_espresso_off_raii
2141+
{
2142+
void* handle{};
2143+
::std::uint32_t off_words{};
2144+
2145+
cuda_espresso_off_raii() noexcept = default;
2146+
cuda_espresso_off_raii(cuda_espresso_off_raii const&) = delete;
2147+
cuda_espresso_off_raii& operator=(cuda_espresso_off_raii const&) = delete;
2148+
2149+
cuda_espresso_off_raii(cuda_espresso_off_raii&& o) noexcept : handle(o.handle), off_words(o.off_words)
2150+
{
2151+
o.handle = nullptr;
2152+
o.off_words = 0u;
2153+
}
2154+
cuda_espresso_off_raii& operator=(cuda_espresso_off_raii&& o) noexcept
2155+
{
2156+
if(this == &o) { return *this; }
2157+
reset();
2158+
handle = o.handle;
2159+
off_words = o.off_words;
2160+
o.handle = nullptr;
2161+
o.off_words = 0u;
2162+
return *this;
2163+
}
2164+
2165+
~cuda_espresso_off_raii() noexcept { reset(); }
2166+
2167+
void reset() noexcept
2168+
{
2169+
#if defined(PHY_ENGINE_ENABLE_CUDA_PE_SYNTH)
2170+
if(handle != nullptr) { phy_engine_pe_synth_cuda_espresso_off_destroy(handle); }
2171+
#endif
2172+
handle = nullptr;
2173+
off_words = 0u;
2174+
}
2175+
};
2176+
2177+
[[nodiscard]] inline cuda_espresso_off_raii cuda_espresso_off_create(::std::uint32_t device_mask,
2178+
::std::uint32_t var_count,
2179+
::std::uint64_t const* off_blocks,
2180+
::std::uint32_t off_words) noexcept
2181+
{
2182+
cuda_espresso_off_raii r{};
2183+
r.off_words = off_words;
2184+
#if defined(PHY_ENGINE_ENABLE_CUDA_PE_SYNTH)
2185+
if(off_blocks == nullptr || off_words == 0u || var_count == 0u || var_count > 16u) { return r; }
2186+
auto const t0 = ::std::chrono::steady_clock::now();
2187+
r.handle = phy_engine_pe_synth_cuda_espresso_off_create(device_mask, var_count, off_blocks, off_words);
2188+
auto const us =
2189+
static_cast<std::size_t>(::std::chrono::duration_cast<::std::chrono::microseconds>(::std::chrono::steady_clock::now() - t0).count());
2190+
cuda_trace_add(u8"espresso_off_create",
2191+
0u,
2192+
off_words,
2193+
static_cast<std::size_t>(off_words) * sizeof(::std::uint64_t),
2194+
0u,
2195+
us,
2196+
r.handle != nullptr);
2197+
#else
2198+
(void)device_mask;
2199+
(void)var_count;
2200+
(void)off_blocks;
2201+
(void)off_words;
2202+
cuda_trace_add(u8"espresso_off_create", 0u, off_words, 0u, 0u, 0u, false, u8"not_built");
2203+
#endif
2204+
return r;
2205+
}
2206+
2207+
[[nodiscard]] inline bool cuda_espresso_off_hits(cuda_espresso_off_raii const& off,
2208+
cuda_cube_desc const* cubes,
2209+
::std::size_t cube_count,
2210+
::std::uint8_t* out_hits) noexcept
2211+
{
2212+
#if defined(PHY_ENGINE_ENABLE_CUDA_PE_SYNTH)
2213+
if(off.handle == nullptr || cubes == nullptr || out_hits == nullptr || cube_count == 0u) { return false; }
2214+
auto const t0 = ::std::chrono::steady_clock::now();
2215+
bool const ok = phy_engine_pe_synth_cuda_espresso_off_hits(off.handle, cubes, cube_count, out_hits);
2216+
auto const us =
2217+
static_cast<std::size_t>(::std::chrono::duration_cast<::std::chrono::microseconds>(::std::chrono::steady_clock::now() - t0).count());
2218+
// OFF-set already resident, so H2D is just the cubes; D2H is the hits vector.
2219+
cuda_trace_add(u8"espresso_hits_off",
2220+
cube_count,
2221+
off.off_words,
2222+
cube_count * sizeof(cuda_cube_desc),
2223+
cube_count * sizeof(::std::uint8_t),
2224+
us,
2225+
ok);
2226+
return ok;
2227+
#else
2228+
(void)off;
2229+
(void)cubes;
2230+
(void)cube_count;
2231+
(void)out_hits;
2232+
cuda_trace_add(u8"espresso_hits_off", cube_count, off.off_words, 0u, 0u, 0u, false, u8"not_built");
2233+
return false;
2234+
#endif
2235+
}
2236+
21282237
[[nodiscard]] inline ::std::uint64_t eval_u64_cone_cpu(cuda_u64_cone_desc const& c) noexcept
21292238
{
21302239
// Leaf patterns (variable i toggles every 2^i minterms, with bit0 corresponding to minterm 0).
@@ -10501,6 +10610,16 @@ namespace phy_engine::verilog::digital
1050110610
return false;
1050210611
};
1050310612

10613+
// Optional: keep OFF-set resident on GPU across the Espresso loop to avoid re-uploading it for each query.
10614+
// This matters because Espresso can generate a very large number of incremental "hits-off" checks.
10615+
cuda_espresso_off_raii off_gpu{};
10616+
bool use_off_gpu{};
10617+
if(opt.cuda_enable && blocksU != 0u && var_count <= 16u)
10618+
{
10619+
off_gpu = cuda_espresso_off_create(opt.cuda_device_mask, static_cast<::std::uint32_t>(var_count), off_bits.data(), static_cast<::std::uint32_t>(blocksU));
10620+
use_off_gpu = (off_gpu.handle != nullptr);
10621+
}
10622+
1050410623
auto cube_hits_off_batch = [&](::std::vector<qm_implicant> const& cubes, ::std::vector<std::uint8_t>& out_hits) noexcept -> void
1050510624
{
1050610625
out_hits.assign(cubes.size(), 0u);
@@ -10519,13 +10638,20 @@ namespace phy_engine::verilog::digital
1051910638
desc[i].value = cubes[i].value;
1052010639
desc[i].mask = cubes[i].mask;
1052110640
}
10522-
used_cuda = cuda_espresso_cube_hits_off(opt.cuda_device_mask,
10523-
desc.data(),
10524-
desc.size(),
10525-
static_cast<std::uint32_t>(var_count),
10526-
off_bits.data(),
10527-
static_cast<std::uint32_t>(blocksU),
10528-
out_hits.data());
10641+
if(use_off_gpu)
10642+
{
10643+
used_cuda = cuda_espresso_off_hits(off_gpu, desc.data(), desc.size(), out_hits.data());
10644+
}
10645+
else
10646+
{
10647+
used_cuda = cuda_espresso_cube_hits_off(opt.cuda_device_mask,
10648+
desc.data(),
10649+
desc.size(),
10650+
static_cast<std::uint32_t>(var_count),
10651+
off_bits.data(),
10652+
static_cast<std::uint32_t>(blocksU),
10653+
out_hits.data());
10654+
}
1052910655
}
1053010656
else
1053110657
{
@@ -10569,46 +10695,106 @@ namespace phy_engine::verilog::digital
1056910695
auto expand_cover_batch = [&]() noexcept -> bool
1057010696
{
1057110697
if(cover.empty() || var_order.empty()) { return false; }
10698+
// GPU-friendly structural batching:
10699+
// The classic per-variable expand loop creates millions of tiny "hits-off" batches.
10700+
// Here we generate a larger pool per round (single- and double-literal relaxations),
10701+
// query hits-off in one batched call, then apply the best safe candidate per cube.
1057210702
bool any{};
10573-
bool changed{true};
10574-
::std::vector<std::size_t> idx{};
10703+
constexpr std::size_t max_rounds = 3;
10704+
auto const var_limit = ::std::min<std::size_t>(var_order.size(), 12u);
10705+
10706+
struct cand_meta
10707+
{
10708+
std::size_t cube_idx{};
10709+
std::uint8_t drop_cnt{};
10710+
};
10711+
10712+
::std::vector<cand_meta> meta{};
1057510713
::std::vector<qm_implicant> cands{};
1057610714
::std::vector<std::uint8_t> hits{};
1057710715

10578-
while(changed)
10716+
for(std::size_t round{}; round < max_rounds; ++round)
1057910717
{
10580-
changed = false;
10581-
for(auto const v: var_order)
10718+
meta.clear();
10719+
cands.clear();
10720+
10721+
// Conservative reserve; still bounded.
10722+
meta.reserve(cover.size() * (var_limit + (var_limit * (var_limit - 1u)) / 2u));
10723+
cands.reserve(cover.size() * (var_limit + (var_limit * (var_limit - 1u)) / 2u));
10724+
10725+
for(std::size_t ci{}; ci < cover.size(); ++ci)
1058210726
{
10583-
idx.clear();
10584-
cands.clear();
10585-
idx.reserve(cover.size() / 2u + 1u);
10586-
cands.reserve(cover.size() / 2u + 1u);
10727+
auto const& c0 = cover[ci];
10728+
::std::vector<::std::uint16_t> bits{};
10729+
bits.reserve(var_limit);
10730+
for(std::size_t oi{}; oi < var_limit; ++oi)
10731+
{
10732+
auto const v = var_order[oi];
10733+
auto const bit = static_cast<::std::uint16_t>(1u << v);
10734+
if(((c0.mask >> v) & 1u) != 0u) { continue; }
10735+
bits.push_back(bit);
10736+
}
10737+
if(bits.empty()) { continue; }
1058710738

10588-
auto const bit = static_cast<::std::uint16_t>(1u << v);
10589-
for(std::size_t i{}; i < cover.size(); ++i)
10590-
{
10591-
auto const& c = cover[i];
10592-
if((c.mask >> v) & 1u) { continue; } // already don't care
10593-
qm_implicant cand = c;
10594-
cand.mask = static_cast<::std::uint16_t>(cand.mask | bit);
10595-
cand.value = static_cast<::std::uint16_t>(cand.value & static_cast<::std::uint16_t>(~bit));
10596-
idx.push_back(i);
10739+
for(auto const b1 : bits)
10740+
{
10741+
qm_implicant cand = c0;
10742+
cand.mask = static_cast<::std::uint16_t>(cand.mask | b1);
10743+
cand.value = static_cast<::std::uint16_t>(cand.value & static_cast<::std::uint16_t>(~b1));
10744+
meta.push_back(cand_meta{ci, 1u});
1059710745
cands.push_back(cand);
1059810746
}
10599-
if(cands.empty()) { continue; }
1060010747

10601-
cube_hits_off_batch(cands, hits);
10602-
if(hits.size() != cands.size()) { continue; }
10748+
for(std::size_t i{}; i < bits.size(); ++i)
10749+
{
10750+
for(std::size_t j{i + 1u}; j < bits.size(); ++j)
10751+
{
10752+
auto const b = static_cast<::std::uint16_t>(bits[i] | bits[j]);
10753+
qm_implicant cand = c0;
10754+
cand.mask = static_cast<::std::uint16_t>(cand.mask | b);
10755+
cand.value = static_cast<::std::uint16_t>(cand.value & static_cast<::std::uint16_t>(~b));
10756+
meta.push_back(cand_meta{ci, 2u});
10757+
cands.push_back(cand);
10758+
}
10759+
}
10760+
}
10761+
10762+
if(cands.empty()) { break; }
1060310763

10604-
for(std::size_t k{}; k < cands.size(); ++k)
10764+
cube_hits_off_batch(cands, hits);
10765+
if(hits.size() != cands.size()) { break; }
10766+
10767+
::std::vector<int> best{};
10768+
::std::vector<std::uint8_t> best_drop{};
10769+
best.assign(cover.size(), -1);
10770+
best_drop.assign(cover.size(), 0u);
10771+
10772+
for(std::size_t k{}; k < cands.size(); ++k)
10773+
{
10774+
if(hits[k]) { continue; }
10775+
auto const ci = meta[k].cube_idx;
10776+
if(ci >= cover.size()) { continue; }
10777+
auto const dcnt = meta[k].drop_cnt;
10778+
if(dcnt > best_drop[ci])
1060510779
{
10606-
if(hits[k]) { continue; }
10607-
cover[idx[k]] = cands[k];
10608-
changed = true;
10609-
any = true;
10780+
best_drop[ci] = dcnt;
10781+
best[ci] = static_cast<int>(k);
1061010782
}
1061110783
}
10784+
10785+
bool changed{};
10786+
for(std::size_t ci{}; ci < cover.size(); ++ci)
10787+
{
10788+
auto const bi = best[ci];
10789+
if(bi < 0) { continue; }
10790+
auto const cand = cands[static_cast<std::size_t>(bi)];
10791+
if(cand.mask == cover[ci].mask && cand.value == cover[ci].value) { continue; }
10792+
cover[ci] = cand;
10793+
changed = true;
10794+
any = true;
10795+
}
10796+
10797+
if(!changed) { break; }
1061210798
}
1061310799
return any;
1061410800
};

0 commit comments

Comments
 (0)